diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt
index 4cf99a4f78e..49b079047a3 100644
--- a/.ci/docker/ci_commit_pins/optimum-executorch.txt
+++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-bd06b54e627fbfd354a2cffa4c80fb21883209a9
+44d8d54e38c0258357d4e92e1fefe21e845947a3
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index e3a53c8bcb5..aafc7565373 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-53a2908a10f414a2f85caa06703a26a40e873869
+cf9d09490c7f6685ec68d5db3acf2e0d73c54d00
diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh
index 7f34e8afb63..30835cf5085 100755
--- a/.ci/scripts/build-qnn-sdk.sh
+++ b/.ci/scripts/build-qnn-sdk.sh
@@ -38,6 +38,7 @@ set_up_aot() {
       -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM=ON \
       -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM_RUNNER=ON \
       -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+      -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
       -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
       -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
       -DPYTHON_EXECUTABLE=python3
diff --git a/.ci/scripts/setup-openvino.sh b/.ci/scripts/setup-openvino.sh
index ff667619125..587494f46ac 100755
--- a/.ci/scripts/setup-openvino.sh
+++ b/.ci/scripts/setup-openvino.sh
@@ -10,19 +10,17 @@ set -ex
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
-git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout releases/2025/1
-git submodule update --init --recursive
-sudo ./install_build_dependencies.sh
-mkdir build && cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON
-make -j$(nproc)
+# Download and install OpenVINO from release packages
+OPENVINO_VERSION="2025.3"
+OPENVINO_BUILD="2025.3.0.19807.44526285f24"
+OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
 
-cd ..
-cmake --install build --prefix dist
+curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL}
+tar -xzf /tmp/openvino_toolkit.tgz
+mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino
 
-source dist/setupvars.sh
-cd ../backends/openvino
+source openvino/setupvars.sh
+cd backends/openvino
 pip install -r requirements.txt
 cd scripts
 ./openvino_build.sh --enable_python
diff --git a/.ci/scripts/setup-samsung-linux-deps.sh b/.ci/scripts/setup-samsung-linux-deps.sh
index 434587975ab..c1f2912713b 100644
--- a/.ci/scripts/setup-samsung-linux-deps.sh
+++ b/.ci/scripts/setup-samsung-linux-deps.sh
@@ -13,7 +13,7 @@ download_ai_lite_core() {
   API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download"
   API_KEY=$SAMSUNG_AI_LITECORE_KEY
 
-  VERSION="0.5"
+  VERSION="0.7"
   OS_NAME="Ubuntu 22.04"
   OUT_FILE="/tmp/exynos-ai-litecore-v${VERSION}.tar.gz"
   TARGET_PATH="/tmp/exynos_ai_lite_core"
@@ -62,7 +62,7 @@ install_enn_backend() {
   export PYTHONPATH=${PYTHONPATH:-}:${EXECUTORCH_ROOT}/..
 }
 
-AI_LITE_CORE_VERSION=0.5.0
+AI_LITE_CORE_VERSION=0.7.0
 
 download_ai_lite_core ${AI_LITE_CORE_VERSION}
 install_enn_backend
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index df98fb43372..a48cc9ec41a 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -58,6 +59,13 @@ fi
 if [[ "$FLOW" == *arm* ]]; then
     # Setup ARM deps.
     .ci/scripts/setup-arm-baremetal-tools.sh
+    source examples/arm/ethos-u-scratch/setup_path.sh
+
+    if [[ "$FLOW" == *ethos_u* ]]; then
+        # Prepare a test runner binary that can run on the Corstone-3x0 FVPs
+        backends/arm/scripts/build_executorch.sh
+        backends/arm/test/setup_testing.sh
+    fi
 fi
 
 if [[ $IS_MACOS -eq 1 ]]; then
diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh
index a89c2cc5809..46c3f71f021 100755
--- a/.ci/scripts/test_ios_ci.sh
+++ b/.ci/scripts/test_ios_ci.sh
@@ -36,6 +36,7 @@ say() {
 
 say "Cloning the Demo App"
 
+git config --global http.postBuffer 524288000
 git clone --depth 1 https://github.com/meta-pytorch/executorch-examples.git
 
 say "Installing CoreML Backend Requirements"
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
index 5f472fad63b..a7ded52ccc6 100644
--- a/.ci/scripts/test_llama_torchao_lowbit.sh
+++ b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -31,6 +31,7 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh
index c3bd2f77b86..d8cb9596ffc 100644
--- a/.ci/scripts/test_llava.sh
+++ b/.ci/scripts/test_llava.sh
@@ -38,6 +38,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS="                      \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON      \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
         -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON      \
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index de28597b1d5..34063a23374 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -48,22 +48,33 @@ prepare_artifacts_upload() {
   fi
 }
 
+
 build_cmake_executor_runner() {
   local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
   mkdir ${CMAKE_OUTPUT_DIR}
+  # Common options:
+  COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE"
   if [[ "$backend_string_select" == "XNNPACK" ]]; then
     echo "Backend $backend_string_select selected"
-    (cd ${CMAKE_OUTPUT_DIR} \
-      && cmake -DCMAKE_BUILD_TYPE=Release \
+    cmake -DCMAKE_BUILD_TYPE=Release \
         -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+        ${COMMON} \
+        -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  elif [[ "$backend_string_select" == "CUDA" ]]; then
+    echo "Backend $backend_string_select selected"
+    cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_CUDA=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+        ${COMMON} \
+        -B${CMAKE_OUTPUT_DIR} .
     cmake --build ${CMAKE_OUTPUT_DIR} -j4
   else
     cmake -DCMAKE_BUILD_TYPE=Debug \
         -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+        ${COMMON} \
         -B${CMAKE_OUTPUT_DIR} .
     cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
   fi
@@ -320,6 +331,13 @@ test_model_with_mediatek() {
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit)
 }
 
+test_model_with_cuda() {
+  # Export a basic .pte and .ptd, then run the model.
+  "${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./"
+  build_cmake_executor_runner "CUDA"
+  ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd"
+}
+
 
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
@@ -372,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
+elif [[ "${BACKEND}" == "cuda" ]]; then
+  echo "Testing ${MODEL_NAME} with cuda..."
+  test_model_with_cuda
+  if [[ $? -eq 0 ]]; then
+    prepare_artifacts_upload
+  fi
 else
   set +e
   if [[ "${BACKEND}" == *"quantization"* ]]; then
diff --git a/.ci/scripts/test_openvino.sh b/.ci/scripts/test_openvino.sh
index 85884a6475b..2bb2115b1ec 100755
--- a/.ci/scripts/test_openvino.sh
+++ b/.ci/scripts/test_openvino.sh
@@ -10,7 +10,7 @@ set -ex
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
-source openvino/dist/setupvars.sh
+source openvino/setupvars.sh
 cd backends/openvino/tests
 python test_runner.py --test_type ops
 python test_runner.py --test_type models
diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
index f06c794f88d..da50d28800a 100644
--- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -129,6 +129,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_BUILD_XNNPACK=ON \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
diff --git a/.ci/scripts/test_yolo12.sh b/.ci/scripts/test_yolo12.sh
index e3f20d5f970..594ddbf86ed 100755
--- a/.ci/scripts/test_yolo12.sh
+++ b/.ci/scripts/test_yolo12.sh
@@ -119,6 +119,8 @@ cmake_install_executorch_libraries() {
           -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
           -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
           -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
           -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
           -B"${build_dir}"
@@ -131,6 +133,8 @@ cmake_install_executorch_libraries() {
                        -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
                        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
                        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+                       -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+                       -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
                        -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
                        -DEXECUTORCH_ENABLE_LOGGING=ON \
                        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index f6f6ece786b..f896d3f1d40 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -125,14 +125,15 @@ build_executorch_runner_cmake() {
   clean_executorch_install_folders
   mkdir "${CMAKE_OUTPUT_DIR}"
 
-  pushd "${CMAKE_OUTPUT_DIR}" || return
   if [[ $1 == "Debug" ]]; then
       CXXFLAGS="-fsanitize=address,undefined"
   else
       CXXFLAGS=""
   fi
-  CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" ..
-  popd || return
+  CXXFLAGS="$CXXFLAGS" retry cmake \
+    -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
+    -DCMAKE_BUILD_TYPE="${1:-Release}" \
+    -B${CMAKE_OUTPUT_DIR} .
 
   if [ "$(uname)" == "Darwin" ]; then
     CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 ))
diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml
index bec6d3a0f5e..beda0f77c83 100644
--- a/.github/workflows/android-release-artifacts.yml
+++ b/.github/workflows/android-release-artifacts.yml
@@ -90,6 +90,10 @@ jobs:
         fi
 
         FLAVOR="${{ inputs.flavor }}"
+        if [ ! -z "$FLAVOR" ]; then
+          GRADLE_ARGS+=" -Dflavor=${FLAVOR}"
+        fi
+
         if [[ "$FLAVOR" == "vulkan" || -z "$FLAVOR" ]]; then
           curl -O https://sdk.lunarg.com/sdk/download/1.4.321.1/linux/vulkansdk-linux-x86_64-1.4.321.1.tar.xz
           tar xf vulkansdk-linux-x86_64-1.4.321.1.tar.xz -C /tmp
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
new file mode 100644
index 00000000000..c1b22e692ab
--- /dev/null
+++ b/.github/workflows/cuda.yml
@@ -0,0 +1,282 @@
+# Test ExecuTorch CUDA Build Compatibility
+# This workflow tests whether ExecuTorch can be successfully built with CUDA support
+# across different CUDA versions (12.6, 12.8, 12.9) using the command:
+# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+#
+# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
+# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
+
+name: Test CUDA Builds
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-cuda-builds:
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda-version: ["12.6", "12.8", "13.0"]
+
+    name: test-executorch-cuda-build-${{ matrix.cuda-version }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda-version }}
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
+        # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+        source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"
+
+  # This job will fail if any of the CUDA versions fail
+  check-all-cuda-builds:
+    needs: test-cuda-builds
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Check if all CUDA builds succeeded
+        run: |
+          if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then
+            echo "ERROR: One or more ExecuTorch CUDA builds failed!"
+            echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
+            exit 1
+          else
+            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!"
+          fi
+
+  test-models-cuda:
+    name: test-models-cuda
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        model: [linear, add, add_mul, resnet18]
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
+
+  export-voxtral-cuda-artifact:
+    name: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral"
+        optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 1024 \
+            --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
+        echo "::endgroup::"
+
+        echo "::group::Store Voxtral Artifacts"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
+
+  benchmark-voxtral-cuda:
+    name: benchmark-voxtral-cuda
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Download Test Audio File"
+        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
+        curl -L $AUDIO_URL -o poem.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/voxtral \
+              -Bcmake-out/examples/models/voxtral/
+        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+              --model_path model.pte \
+              --data_path aoti_cuda_blob.ptd \
+              --tokenizer_path tekken.json \
+              --audio_path poem.wav \
+              --processor_path voxtral_preprocessor.pte \
+              --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Expected output 'poem' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index c15fadd102f..5b646cba9d1 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -787,7 +787,6 @@ jobs:
       contents: read
     strategy:
       fail-fast: false
-    if: false # TODO Re-enable after fixing timeouts (#14314)
     with:
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-gcc9
@@ -910,12 +909,12 @@ jobs:
       contents: read
     secrets: inherit
     with:
+      secrets-env: SAMSUNG_AI_LITECORE_KEY
       runner: linux.2xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12-android
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
-      secrets-env: SAMSUNG_AI_LITECORE_KEY
       script: |
         set -ex
 
@@ -936,6 +935,12 @@ jobs:
           python -m executorch.examples.samsung.aot_compiler --model_name=$model -c E9955
         done
 
+        # Test quant models
+        model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter"
+        for m_script in $model_scripts; do
+          python -m executorch.examples.samsung.scripts.${m_script} -c e9955 -p A8W8
+        done
+
         # Test ops
         python -m unittest discover -s backends/samsung/test/ops -p "test_*.py"
 
@@ -970,11 +975,16 @@ jobs:
         PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build
 
         # Test models serially
-        models="mv2 mv3 edsr resnet18 resnet50 dl3"
+        models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4"
         for model in $models; do
           python -m examples.vulkan.export --model_name=$model --test
         done
 
+        # For selected vision models, test with dynamic shapes
+        models="mv2 resnet18 resnet50 ic3 densenet161"
+        for model in $models; do
+          python -m examples.vulkan.export --model_name=$model --test -d
+        done
 
   test-vulkan-operators-linux:
     name: test-vulkan-operators-linux
diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml
index bee74fee172..22e3d524f6b 100644
--- a/.github/workflows/test-backend-arm.yml
+++ b/.github/workflows/test-backend-arm.yml
@@ -12,6 +12,9 @@ on:
     paths:
       - .github/workflows/test-backend-arm.yml
       - .github/workflows/_test_backend.yml
+      - .ci/scripts/test_backend.sh
+      - backends/test/suite/flow.py
+      - backends/test/suite/flows/arm.py
   workflow_dispatch:
 
 concurrency:
@@ -23,7 +26,7 @@ jobs:
     uses: ./.github/workflows/_test_backend.yml
     with:
       backend: arm
-      flows: '["arm_tosa"]'
+      flows: '["arm_tosa_fp", "arm_tosa_int", "arm_ethos_u55", "arm_ethos_u85"]'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 120
       run-linux: true
diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/test-cuda-builds.yml
deleted file mode 100644
index 5e054c1de84..00000000000
--- a/.github/workflows/test-cuda-builds.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-# Test ExecuTorch CUDA Build Compatibility
-# This workflow tests whether ExecuTorch can be successfully built with CUDA support
-# across different CUDA versions (12.6, 12.8, 12.9) using the command:
-# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
-#
-# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
-# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed.
-
-name: Test CUDA Builds
-
-on:
-  pull_request:
-  push:
-    branches:
-      - main
-      - release/*
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: false
-
-jobs:
-  test-cuda-builds:
-    strategy:
-      fail-fast: false
-      matrix:
-        cuda-version: ["12.6", "12.8", "13.0"]
-
-    name: test-executorch-cuda-build-${{ matrix.cuda-version }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      timeout: 90
-      runner: linux.g5.4xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: ${{ matrix.cuda-version }}
-      use-custom-docker-registry: false
-      submodules: recursive
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      script: |
-        set -eux
-
-        # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version
-        # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
-        source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}"
-
-  # This job will fail if any of the CUDA versions fail
-  check-all-cuda-builds:
-    needs: test-cuda-builds
-    runs-on: ubuntu-latest
-    if: always()
-    steps:
-      - name: Check if all CUDA builds succeeded
-        run: |
-          if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then
-            echo "ERROR: One or more ExecuTorch CUDA builds failed!"
-            echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
-            exit 1
-          else
-            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!"
-          fi
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index ae3001ca920..8add54af49c 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -289,6 +289,7 @@ jobs:
           - test_arm_baremetal: test_models_ethos-u55
           - test_arm_baremetal: test_models_ethos-u85
           - test_arm_baremetal: test_smaller_stories_llama
+          - test_arm_baremetal: test_memory_allocation
       fail-fast: false
     with:
       runner: linux.2xlarge.memory
@@ -345,7 +346,7 @@ jobs:
         elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
           setup_script_args="--target-toolchain zephyr"
           toolchain_prefix=arm-zephyr-eabi-
-          threshold="135168" # 132 KiB
+          threshold="135240" # 132 KiB
           toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
         else
           echo "Fail unsupport OS selection ${{ matrix.os }}"
@@ -823,11 +824,26 @@ jobs:
         echo "Recipe: $RECIPE"
         echo "Quantize: $QUANTIZE"
 
-        echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
+        echo "::group::Setup ExecuTorch"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        echo "::endgroup::"
+
+        echo "::group::Test MODEL: $MODEL RECIPE: $RECIPE QUANTIZE: $QUANTIZE"
+        export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
+        python .ci/scripts/test_huggingface_optimum_model.py --model "$MODEL" --recipe "$RECIPE" $QUANTIZE --model_dir "$OUTPUT_DIR"
+        echo "::endgroup::"
+
         # Build executor_runner with ETdump enabled
         PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
@@ -836,6 +852,7 @@ jobs:
           -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
           -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
           -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
           -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
           -DEXECUTORCH_BUILD_XNNPACK=ON \
           -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -845,25 +862,6 @@ jobs:
           -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
           -Bcmake-out .
         cmake --build cmake-out -j16 --target install --config Release
-        echo "::endgroup::"
-
-        echo "::group::Set up Hugging Face"
-        pip install -U "huggingface_hub[cli]"
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
-        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        git clone https://github.com/huggingface/optimum-executorch
-        pushd optimum-executorch
-        # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout $OPTIMUM_ET_COMMIT
-        python install_dev.py --skip_override_torch
-        popd
-        pip list
-        echo "::endgroup::"
-
-        echo "::group::Run tests"
-        export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
-        python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR}
-        echo "::endgroup::"
 
         echo "::group::Generate artifacts for performance profiling"
         ./cmake-out/executor_runner \
@@ -930,16 +928,11 @@ jobs:
         ${CONDA_RUN} python install_executorch.py
         echo "::endgroup::"
 
-        echo "::group::Set up Hugging Face"
-        pip install -U "huggingface_hub[cli]"
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
-        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        git clone https://github.com/huggingface/optimum-executorch
-        pushd optimum-executorch
-        # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout $OPTIMUM_ET_COMMIT
-        ${CONDA_RUN} python install_dev.py --skip_override_torch
-        popd
+        echo "::group::Set up Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         ${CONDA_RUN} pip list
         echo "::endgroup::"
 
diff --git a/.lintrunner.toml b/.lintrunner.toml
index ef771bdb9df..b366c141799 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -219,6 +219,7 @@ exclude_patterns = [
     '**/*.gif',
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
+    'examples/cuda',
     # File contains @generated
     'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7012ec641bf..10e2eb437e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -226,7 +226,7 @@ if(EXECUTORCH_BUILD_CPUINFO)
   install(
     TARGETS cpuinfo
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -266,10 +266,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   executorch_move_interface_include_directories_to_build_time_only(
     pthreadpool_interface
   )
+
+  if(APPLE)
+    # Use hidden visibility for pthreadpool on Apple platforms to avoid issues
+    # with pthreadpool symbols from libtorch_cpu taking precedence over the ones
+    # from the pthreadpool library statically linked in _portable_lib. The
+    # pthreadpool public APIs are marked as weak by default on some Apple
+    # platforms, so setting to hidden visibility works around this by not
+    # putting the symbol in the indirection table. See
+    # https://github.com/pytorch/executorch/issues/14321 for more details.
+    target_compile_options(pthreadpool PRIVATE -fvisibility=hidden)
+  endif()
+
   install(
     TARGETS pthreadpool pthreadpool_interface fxdiv
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -284,7 +296,10 @@ if(EXECUTORCH_BUILD_TESTS)
 endif()
 
 # TODO(dbort): Fix these warnings and remove this flag.
-set(_common_compile_options -Wno-deprecated-declarations -fPIC)
+set(_common_compile_options
+    $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
 
 # Let files say "include <executorch/path/to/header.h>".
 # TODO(#6475): This requires/assumes that the repo lives in a directory named
@@ -587,6 +602,16 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  list(APPEND _executorch_backends coretex_m_backend)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  # Build common AOTI functionality (required for CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+  # Build CUDA-specific AOTI functionality
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
+  # Add aoti_cuda to backends - it already depends on aoti_common
+  list(APPEND _executorch_backends aoti_cuda)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
@@ -630,6 +655,11 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   list(APPEND _executorch_extensions extension_module_static)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/named_data_map)
+  list(APPEND _executorch_extensions extension_named_data_map)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_LLM)
   if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
     set(SUPPORT_REGEX_LOOKAHEAD ON)
@@ -708,7 +738,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
   install(
     TARGETS torchao_ops_executorch torchao_kernels_aarch64
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${_common_include_directories}
   )
@@ -719,7 +749,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
     install(
       TARGETS kleidiai
       EXPORT ExecuTorchTargets
-      DESTINATION lib
+      DESTINATION ${CMAKE_INSTALL_LIBDIR}
       INCLUDES
       DESTINATION ${_common_include_directories}
     )
@@ -760,7 +790,10 @@ if(EXECUTORCH_BUILD_PYBIND)
       bundled_module PUBLIC ${_common_include_directories}
     )
     target_compile_options(
-      bundled_module PUBLIC -Wno-deprecated-declarations -fPIC
+      bundled_module
+      PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+             $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations
+             -fPIC>
     )
   endif()
 
@@ -832,8 +865,14 @@ if(EXECUTORCH_BUILD_PYBIND)
   endif()
 
   # compile options for pybind
-  set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti
-                              -fexceptions
+  set(_pybind_compile_options
+      $<$<CXX_COMPILER_ID:MSVC>:/EHsc
+      /GR
+      /wd4996>
+      $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations
+      -fPIC
+      -frtti
+      -fexceptions>
   )
 
   # util lib
@@ -999,7 +1038,7 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
   install(
     TARGETS executorch_selected_kernels
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
   )
 else()
   # No selective build - link the full library.
@@ -1021,6 +1060,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
                             extension_runner_util gflags executorch_backends
   )
 
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/CODEOWNERS b/CODEOWNERS
index 10baed9ede4..11f3ca07615 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -14,6 +14,7 @@
 /backends/transforms @kimishpatel
 /backends/vulkan @SS-JIA
 /backends/xnnpack @digantdesai @mcr229
+/backends/nxp @robert-kalmar
 
 /devtools @Gasoonjia
 
@@ -33,6 +34,7 @@
 /examples/qualcomm @cccclai
 /examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka
 /examples/xnnpack @digantdesai @mcr229
+/examples/nxp @robert-kalmar
 
 /exir/backend @cccclai @kimishpatel @JacobSzwejbka
 /exir @JacobSzwejbka @larryliu0820
diff --git a/README.md b/README.md
index 17327990a1d..531fcc3b4ef 100644
--- a/README.md
+++ b/README.md
@@ -1,72 +1,250 @@
 <div align="center">
-  <img src="docs/source/_static/img/et-logo.png" alt="Logo" width="200">
-  <h1 align="center">ExecuTorch: A powerful on-device AI Framework</h1>
+  <img src="docs/source/_static/img/et-logo.png" alt="ExecuTorch logo mark" width="200">
+  <h1>ExecuTorch</h1>
+  <p><strong>On-device AI inference powered by PyTorch</strong></p>
 </div>
 
-
 <div align="center">
-  <a href="https://github.com/pytorch/executorch/graphs/contributors"><img src="https://img.shields.io/github/contributors/pytorch/executorch?style=for-the-badge&color=blue" alt="Contributors"></a>
-  <a href="https://github.com/pytorch/executorch/stargazers"><img src="https://img.shields.io/github/stars/pytorch/executorch?style=for-the-badge&color=blue" alt="Stargazers"></a>
-  <a href="https://discord.gg/Dh43CKSAdc"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
-  <a href="https://pytorch.org/executorch/main/index"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
-  <hr>
+  <a href="https://pypi.org/project/executorch/"><img src="https://img.shields.io/pypi/v/executorch?style=for-the-badge&color=blue" alt="PyPI - Version"></a>
+  <a href="https://github.com/pytorch/executorch/graphs/contributors"><img src="https://img.shields.io/github/contributors/pytorch/executorch?style=for-the-badge&color=blue" alt="GitHub - Contributors"></a>
+  <a href="https://github.com/pytorch/executorch/stargazers"><img src="https://img.shields.io/github/stars/pytorch/executorch?style=for-the-badge&color=blue" alt="GitHub - Stars"></a>
+  <a href="https://discord.gg/Dh43CKSAdc"><img src="https://img.shields.io/badge/Discord-Join%20Us-blue?logo=discord&logoColor=white&style=for-the-badge" alt="Discord - Chat with Us"></a>
+  <a href="https://docs.pytorch.org/executorch/main/index.html"><img src="https://img.shields.io/badge/Documentation-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Documentation"></a>
 </div>
 
-**ExecuTorch** is an end-to-end solution for on-device inference and training. It powers much of Meta's on-device AI experiences across Facebook, Instagram, Meta Quest, Ray-Ban Meta Smart Glasses, WhatsApp, and more.
+**ExecuTorch** is PyTorch's unified solution for deploying AI models on-device—from smartphones to microcontrollers—built for privacy, performance, and portability. It powers Meta's on-device AI across **Instagram, WhatsApp, Quest 3, Ray-Ban Meta Smart Glasses**, and [more](https://docs.pytorch.org/executorch/main/success-stories.html).
+
+Deploy **LLMs, vision, speech, and multimodal models** with the same PyTorch APIs you already know—accelerating research to production with seamless model export, optimization, and deployment. No manual C++ rewrites. No format conversions. No vendor lock-in.
+
+<details>
+  <summary><strong>📘 Table of Contents</strong></summary>
+
+- [Why ExecuTorch?](#why-executorch)
+- [How It Works](#how-it-works)
+- [Quick Start](#quick-start)
+  - [Installation](#installation)
+  - [Export and Deploy in 3 Steps](#export-and-deploy-in-3-steps)
+  - [Run on Device](#run-on-device)
+  - [LLM Example: Llama](#llm-example-llama)
+- [Platform & Hardware Support](#platform--hardware-support)
+- [Production Deployments](#production-deployments)
+- [Examples & Models](#examples--models)
+- [Key Features](#key-features)
+- [Documentation](#documentation)
+- [Community & Contributing](#community--contributing)
+- [License](#license)
+
+</details>
+
+## Why ExecuTorch?
+
+- **🔒 Native PyTorch Export** — Direct export from PyTorch. No .onnx, .tflite, or intermediate format conversions. Preserve model semantics.
+- **⚡ Production-Proven** — Powers billions of users at [Meta with real-time on-device inference](https://engineering.fb.com/2025/07/28/android/executorch-on-device-ml-meta-family-of-apps/).
+- **💾 Tiny Runtime** — 50KB base footprint. Runs on microcontrollers to high-end smartphones.
+- **🚀 [12+ Hardware Backends](https://docs.pytorch.org/executorch/main/backends-overview.html)** — Open-source acceleration for Apple, Qualcomm, ARM, MediaTek, Vulkan, and more.
+- **🎯 One Export, Multiple Backends** — Switch hardware targets with a single line change. Deploy the same model everywhere.
+
+## How It Works
+
+ExecuTorch uses **ahead-of-time (AOT) compilation** to prepare PyTorch models for edge deployment:
+
+1. **🧩 Export** — Capture your PyTorch model graph with `torch.export()`
+2. **⚙️ Compile** — Quantize, optimize, and partition to hardware backends → `.pte`
+3. **🚀 Execute** — Load `.pte` on-device via lightweight C++ runtime
+
+Models use a standardized [Core ATen operator set](https://docs.pytorch.org/executorch/main/compiler-ir-advanced.html#intermediate-representation). [Partitioners](https://docs.pytorch.org/executorch/main/compiler-delegate-and-partitioner.html) delegate subgraphs to specialized hardware (NPU/GPU) with CPU fallback.
+
+Learn more: [How ExecuTorch Works](https://docs.pytorch.org/executorch/main/intro-how-it-works.html) • [Architecture Guide](https://docs.pytorch.org/executorch/main/getting-started-architecture.html)
+
+## Quick Start
+
+### Installation
+
+```bash
+pip install executorch
+```
+
+For platform-specific setup (Android, iOS, embedded systems), see the [Quick Start](https://docs.pytorch.org/executorch/main/quick-start-section.html) documentation for additional info.
+
+### Export and Deploy in 3 Steps
+
+```python
+import torch
+from executorch.exir import to_edge_transform_and_lower
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+
+# 1. Export your PyTorch model
+model = MyModel().eval()
+example_inputs = (torch.randn(1, 3, 224, 224),)
+exported_program = torch.export.export(model, example_inputs)
+
+# 2. Optimize for target hardware (switch backends with one line)
+program = to_edge_transform_and_lower(
+    exported_program,
+    partitioner=[XnnpackPartitioner()]  # CPU | CoreMLPartitioner() for iOS | QnnPartitioner() for Qualcomm
+).to_executorch()
+
+# 3. Save for deployment
+with open("model.pte", "wb") as f:
+    f.write(program.buffer)
+
+# Test locally via ExecuTorch runtime's pybind API (optional)
+from executorch.runtime import Runtime
+runtime = Runtime.get()
+method = runtime.load_program("model.pte").load_method("forward")
+outputs = method.execute([torch.randn(1, 3, 224, 224)])
+```
+
+### Run on Device
+
+**[C++](https://docs.pytorch.org/executorch/main/using-executorch-cpp.html)**
+```cpp
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+Module module("model.pte");
+auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
+auto outputs = module.forward({tensor});
+```
+
+**[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)**
+```swift
+let module = Module(filePath: "model.pte")
+let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0])
+let outputs: [Value] = try module.forward([input])
+```
+
+**[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)**
+```kotlin
+val module = Module.load("model.pte")
+val inputTensor = Tensor.fromBlob(floatArrayOf(1.0f, 2.0f, 3.0f, 4.0f), longArrayOf(2, 2))
+val outputs = module.forward(EValue.from(inputTensor))
+```
+
+### LLM Example: Llama
+
+Export Llama models using the [`export_llm`](https://docs.pytorch.org/executorch/main/llm/export-llm.html) script or [Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch):
+
+```bash
+# Using export_llm
+python -m executorch.extension.llm.export.export_llm --model llama3_2 --output llama.pte
+
+# Using Optimum-ExecuTorch
+optimum-cli export executorch \
+  --model meta-llama/Llama-3.2-1B \
+  --task text-generation \
+  --recipe xnnpack \
+  --output_dir llama_model
+```
 
-It supports a wide range of models including LLMs (Large Language Models), CV (Computer Vision), ASR (Automatic Speech Recognition), and TTS (Text to Speech).
+Run on-device with the LLM runner API:
 
-Platform Support:
-- Operating Systems:
-  - iOS
-  - MacOS (ARM64)
-  - Android
-  - Linux
-  - Microcontrollers
+**[C++](https://docs.pytorch.org/executorch/main/llm/run-with-c-plus-plus.html)**
+```cpp
+#include <executorch/extension/llm/runner/text_llm_runner.h>
 
-- Hardware Acceleration:
-  - Apple
-  - Arm
-  - Cadence
-  - MediaTek
-  - NXP
-  - OpenVINO
-  - Qualcomm
-  - Vulkan
-  - XNNPACK
+auto runner = create_llama_runner("llama.pte", "tiktoken.bin");
+executorch::extension::llm::GenerationConfig config{
+    .seq_len = 128, .temperature = 0.8f};
+runner->generate("Hello, how are you?", config);
+```
 
-Key value propositions of ExecuTorch are:
+**[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)**
+```swift
+let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin")
+try runner.generate("Hello, how are you?", Config {
+    $0.sequenceLength = 128
+}) { token in
+    print(token, terminator: "")
+}
+```
 
-- **Portability:** Compatibility with a wide variety of computing platforms,
-  from high-end mobile phones to highly constrained embedded systems and
-  microcontrollers.
-- **Productivity:** Enabling developers to use the same toolchains and Developer
-  Tools from PyTorch model authoring and conversion, to debugging and deployment
-  to a wide variety of platforms.
-- **Performance:** Providing end users with a seamless and high-performance
-  experience due to a lightweight runtime and utilizing full hardware
-  capabilities such as CPUs, NPUs, and DSPs.
+**Kotlin (Android)** — [API Docs](https://docs.pytorch.org/executorch/main/javadoc/org/pytorch/executorch/extension/llm/package-summary.html) • [Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android/LlamaDemo)
+```kotlin
+val llmModule = LlmModule("llama.pte", "tiktoken.bin", 0.8f)
+llmModule.load()
+llmModule.generate("Hello, how are you?", 128, object : LlmCallback {
+    override fun onResult(result: String) { print(result) }
+    override fun onStats(stats: String) { }
+})
+```
 
-## Getting Started
-To get started you can:
+For multimodal models (vision, audio), use the [MultiModal runner API](extension/llm/runner) which extends the LLM runner to handle image and audio inputs alongside text. See [Llava](examples/models/llava/README.md) and [Voxtral](examples/models/voxtral/README.md) examples.
 
-- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
-- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
-- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [Llava](examples/models/llava/README.md), [Voxtral](examples/models/voxtral/README.md), and [LFM2](examples/models/lfm2/README.md).
+See [examples/models/llama](examples/models/llama/README.md) for complete workflow including quantization, mobile deployment, and advanced options.
 
-## Feedback and Engagement
+**Next Steps:**
+- 📖 [Step-by-step tutorial](https://docs.pytorch.org/executorch/main/getting-started.html) — Complete walkthrough for your first model
+- ⚡ [Colab notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) — Try ExecuTorch instantly in your browser
+- 🤖 [Deploy Llama models](examples/models/llama/README.md) — LLM workflow with quantization and mobile demos
 
-We welcome any feedback, suggestions, and bug reports from the community to help
-us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/Dh43CKSAdc)
+## Platform & Hardware Support
 
-## Contributing
+| **Platform**     | **Supported Backends**                                   |
+|------------------|----------------------------------------------------------|
+| Android          | XNNPACK, Vulkan, Qualcomm, MediaTek, Samsung Exynos      |
+| iOS              | XNNPACK, MPS, CoreML (Neural Engine)                     |
+| Linux / Windows  | XNNPACK, OpenVINO, CUDA *(experimental)*                 |
+| macOS            | XNNPACK, MPS, Metal *(experimental)*                     |
+| Embedded / MCU   | XNNPACK, ARM Ethos-U, NXP, Cadence DSP                   |
 
-We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/Dh43CKSAdc)
+See [Backend Documentation](https://docs.pytorch.org/executorch/main/backends-overview.html) for detailed hardware requirements and optimization guides.
 
+## Production Deployments
 
-## Directory Structure
+ExecuTorch powers on-device AI at scale across Meta's family of apps, VR/AR devices, and partner deployments. [View success stories →](https://docs.pytorch.org/executorch/main/success-stories.html)
 
-Please refer to the [Codebase structure](CONTRIBUTING.md#codebase-structure) section of the [Contributing Guidelines](CONTRIBUTING.md) for more details.
+## Examples & Models
+
+**LLMs:** [Llama 3.2/3.1/3](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [LiquidAI LFM2](examples/models/lfm2/README.md)
+
+**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language)
+
+**Vision/Speech:** [MobileNetV2](https://github.com/meta-pytorch/executorch-examples/tree/main/mv2), [DeepLabV3](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3)
+
+**Resources:** [`examples/`](examples/) directory • [executorch-examples](https://github.com/meta-pytorch/executorch-examples) mobile demos • [Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch) for HuggingFace models
+
+## Key Features
+
+ExecuTorch provides advanced capabilities for production deployment:
+
+- **Quantization** — Built-in support via [torchao](https://docs.pytorch.org/ao) for 8-bit, 4-bit, and dynamic quantization
+- **Memory Planning** — Optimize memory usage with ahead-of-time allocation strategies
+- **Developer Tools** — ETDump profiler, ETRecord inspector, and model debugger
+- **Selective Build** — Strip unused operators to minimize binary size
+- **Custom Operators** — Extend with domain-specific kernels
+- **Dynamic Shapes** — Support variable input sizes with bounded ranges
+
+See [Advanced Topics](https://docs.pytorch.org/executorch/main/advanced-topics-section.html) for quantization techniques, custom backends, and compiler passes.
+
+## Documentation
+
+- [**Documentation Home**](https://docs.pytorch.org/executorch/main/index.html) — Complete guides and tutorials
+- [**API Reference**](https://docs.pytorch.org/executorch/main/api-section.html) — Python, C++, Java/Kotlin APIs
+- [**Backend Integration**](https://docs.pytorch.org/executorch/main/backend-delegates-integration.html) — Build custom hardware backends
+- [**Troubleshooting**](https://docs.pytorch.org/executorch/main/using-executorch-troubleshooting.html) — Common issues and solutions
+
+## Community & Contributing
+
+We welcome contributions from the community!
+
+- 💬 [**GitHub Discussions**](https://github.com/pytorch/executorch/discussions) — Ask questions and share ideas
+- 🎮 [**Discord**](https://discord.gg/Dh43CKSAdc) — Chat with the team and community
+- 🐛 [**Issues**](https://github.com/pytorch/executorch/issues) — Report bugs or request features
+- 🤝 [**Contributing Guide**](CONTRIBUTING.md) — Guidelines and codebase structure
 
 ## License
-ExecuTorch is BSD licensed, as found in the LICENSE file.
+
+ExecuTorch is BSD licensed, as found in the [LICENSE](LICENSE) file.
+
+<br><br>
+
+---
+
+<div align="center">
+  <p><strong>Part of the PyTorch ecosystem</strong></p>
+  <p>
+    <a href="https://github.com/pytorch/executorch">GitHub</a> •
+    <a href="https://docs.pytorch.org/executorch">Documentation</a>
+  </p>
+</div>
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index 2aa8a5692ac..fcabb0a3f2b 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -30,25 +30,28 @@ set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp)
 add_library(aoti_common STATIC ${_aoti_common_sources})
 target_include_directories(
   aoti_common
-  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
          # PyTorch AOTI headers from ExecuTorch's torch detection
          ${TORCH_INCLUDE_DIRS}
 )
-target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
-# Ensure symbols are exported properly
-target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)
-
-# Link against PyTorch libraries and standard libraries
-target_link_libraries(
+target_compile_options(
   aoti_common
-  PUBLIC extension_tensor ${CMAKE_DL_LIBS}
-         # Link PyTorch libraries for AOTI functions
-         ${TORCH_LIBRARIES}
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
 )
+# Ensure symbols are exported properly
+target_link_options(
+  aoti_common PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+)
+
+# Link against ExecuTorch libraries and standard libraries
+target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
 executorch_target_link_options_shared_lib(aoti_common)
 
 install(
   TARGETS aoti_common
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
diff --git a/backends/aoti/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp
index 03be835a0c3..46a246faeb8 100644
--- a/backends/aoti/aoti_model_container.cpp
+++ b/backends/aoti/aoti_model_container.cpp
@@ -25,6 +25,13 @@ AOTInductorModelContainerGetNumOutputsFunc
     AOTInductorModelContainerGetNumOutputs = nullptr;
 AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
 
+// Additional global function pointers for AOT Inductor model container
+// operations needed by Metal backend
+AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName = nullptr;
+AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants = nullptr;
+
 } // extern "C"
 
 } // namespace aoti
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index 4b20aefc976..877f019c457 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor;
 extern "C" {
 
 // Type definitions
+using AOTITensorHandle = Tensor*;
 using AOTIRuntimeError = Error;
 
 // Forward declarations for AOT Inductor model container
@@ -69,12 +70,35 @@ extern AOTInductorModelContainerGetNumOutputsFunc
     AOTInductorModelContainerGetNumOutputs;
 extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
 
+// Retrieves the name of an input tensor by index from the AOTI model container.
+// Needed by Metal backend
+using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t input_idx,
+    const char** input_name);
+
+// Retrieves the number of constants from the AOTI model container.
+// Needed by Metal backend
+using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+// Global function pointers (will be loaded dynamically).
+// Needed by Metal backend
+extern AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName;
+extern AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants;
+
 } // extern "C"
 
 // AOTI Delegate Handle structure
 struct AOTIDelegateHandle {
   void* so_handle;
+  std::string so_path;
   AOTInductorModelContainerHandle container_handle;
+  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
+                     // dependency
 };
 
 } // namespace aoti
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index 2f9b36e3c4f..1afd137aa26 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -51,13 +51,32 @@ AOTITorchError aoti_torch_get_storage_offset(
 
 AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) {
   auto it = internal::tensor_to_strides.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_strides.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_strides = tensor->strides();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_strides.begin(),
+        tensor_strides.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> strides(tensor->dim());
     auto tensor_strides = tensor->strides();
     for (int i = 0; i < tensor->dim(); i++) {
       strides[i] = tensor_strides[i];
     }
-    it = internal::tensor_to_strides.emplace(tensor, std::move(strides)).first;
+    it =
+        internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides))
+            .first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -80,13 +99,31 @@ AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) {
 
 AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) {
   auto it = internal::tensor_to_sizes.find(tensor);
+  bool needs_update = false;
+
   if (it == internal::tensor_to_sizes.end()) {
+    needs_update = true;
+  } else {
+    // CRITICAL: Multimodal models reuse tensors with different shapes across
+    // executions (e.g., variable-length audio). We MUST validate cached
+    // metadata matches current tensor state, or CUDA kernels will receive
+    // incorrect shapes leading to memory corruption and segfaults.
+    auto tensor_sizes = tensor->sizes();
+    needs_update = !std::equal(
+        it->second.begin(),
+        it->second.end(),
+        tensor_sizes.begin(),
+        tensor_sizes.end());
+  }
+
+  if (needs_update) {
     std::vector<int64_t> sizes(tensor->dim());
     auto tensor_sizes = tensor->sizes();
     for (int i = 0; i < tensor->dim(); i++) {
       sizes[i] = tensor_sizes[i];
     }
-    it = internal::tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+    it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes))
+             .first;
   }
 
   // For 0D tensors, data() returns nullptr on empty vectors, but we need to
@@ -127,11 +164,24 @@ int32_t aoti_torch_layout_strided() {
 }
 
 // Dtype constants - these return the PyTorch dtype codes
-// Currently only float32 is supported, but using robust enum-based approach
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
 
+int32_t aoti_torch_dtype_bfloat16() {
+  return 15; // PyTorch's bfloat16 dtype code
+}
+
+int32_t aoti_torch_dtype_int64() {
+  return 4; // PyTorch's int64 dtype code
+}
+
+// Dtype utility function needed by Metal backend.
+// Returns the size of the dtype in bytes.
+size_t aoti_torch_dtype_element_size(int32_t dtype) {
+  return dtype_to_element_size(dtype);
+}
+
 // Cleanup functions
 void cleanup_tensor_metadata() {
   internal::tensor_to_sizes.clear();
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index ffcbaa11a08..b79e4c86715 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -58,6 +58,11 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
+int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int64();
+
+// Dtype utility function needed by Metal backend
+size_t aoti_torch_dtype_element_size(int32_t dtype);
 
 // Autograd mode functions
 int32_t aoti_torch_grad_mode_is_enabled();
diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl
index 79f082e5a89..8bf44573bb3 100644
--- a/backends/aoti/targets.bzl
+++ b/backends/aoti/targets.bzl
@@ -51,7 +51,7 @@ def define_common_targets():
         link_whole = True,
         supports_python_dlopen = True,
         visibility = ["@EXECUTORCH_CLIENTS"],
-        deps = [
+        exported_deps = [
             ":common_shims",
             ":model_container",
         ],
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 1c872e08648..78c07bcea6e 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt
index 9879a05e3dc..17e2d94e336 100644
--- a/backends/apple/coreml/CMakeLists.txt
+++ b/backends/apple/coreml/CMakeLists.txt
@@ -115,7 +115,7 @@ if(APPLE)
 endif()
 target_compile_options(coreml_util PUBLIC -fPIC)
 
-install(TARGETS coreml_util DESTINATION lib)
+install(TARGETS coreml_util DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 install(
   DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
@@ -154,7 +154,7 @@ target_compile_options(coreml_inmemoryfs PUBLIC -fPIC)
 
 install(
   TARGETS coreml_inmemoryfs
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
@@ -251,7 +251,7 @@ if(APPLE)
   install(
     TARGETS coremldelegate coreml_util coreml_inmemoryfs
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
     INCLUDES
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
   )
diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt
index 5a253347b01..99a8afa16ac 100644
--- a/backends/apple/mps/CMakeLists.txt
+++ b/backends/apple/mps/CMakeLists.txt
@@ -77,7 +77,7 @@ target_compile_options(mpsdelegate PRIVATE "-fno-objc-arc")
 install(
   TARGETS mpsdelegate mps_schema
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/backends/arm/README.md b/backends/arm/README.md
index e495a8e40cb..0abf5e9bf55 100644
--- a/backends/arm/README.md
+++ b/backends/arm/README.md
@@ -6,7 +6,7 @@ PyTorch models to a TOSA representation. This representation is used to
 deploy to the following targets:
 
 - **Arm&reg; Ethos&trade;-U55/65/85** - Compiled using the Ethos-U Vela compiler.
-- **VGF (Vulkan&reg; Graph Format)** – SPIR-V™ representation for Vulkan-capable devices.
+- **VGF Format, for ML extensions for Vulkan®** – a format containing SPIR-V™ ML operators for Vulkan-capable devices.
 
 The backend provides an ahead-of-time (AOT) flow, that produces a PTE file for your
 chosen target. The AOT flow supports the following development operating systems:
diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS
index a78ab252739..a737c4bc9de 100644
--- a/backends/arm/TARGETS
+++ b/backends/arm/TARGETS
@@ -106,3 +106,17 @@ runtime.python_library(
         "//caffe2:torch",
     ]
 )
+runtime.python_library(
+    name = "_factory",
+    srcs = [
+        "util/_factory.py"
+    ],
+    deps = [
+        ":ethosu",
+        ":vgf",
+        ":arm_compile_spec",
+        "//executorch/backends/arm/quantizer:lib",
+        "//executorch/exir/backend:operator_support",
+        "//executorch/exir/backend:compile_spec_schema",
+    ]
+)
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index 93bf20e69c1..b1337c38a58 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -27,6 +27,7 @@
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_acosh_pass import DecomposeAcoshPass  # noqa
 from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass  # noqa
+from .decompose_add_sub_alpha_pass import DecomposeAddSubAlphaPass  # noqa
 from .decompose_addmm_pass import DecomposeAddmmPass  # noqa
 from .decompose_asin_and_acos_pass import DecomposeAsinAndAcosPass  # noqa
 from .decompose_asinh_pass import DecomposeAsinhPass  # noqa
@@ -81,7 +82,7 @@
 from .insert_int32_casts_after_int64_placeholders import (  # noqa
     InsertInt32CastsAfterInt64PlaceholdersPass,
 )
-from .insert_rescales_pass import InsertRescalePass  # noqa
+from .insert_rescales_pass import InsertRescaleInt32Pass, InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
 from .match_arg_dtype_pass import MatchArgDtypePass  # noqa
 from .match_arg_ranks_pass import MatchArgRanksPass  # noqa
@@ -91,6 +92,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
 )
+from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa
diff --git a/backends/arm/_passes/_debug_passes.py b/backends/arm/_passes/_debug_passes.py
index 7809885d465..4c1661e50a9 100644
--- a/backends/arm/_passes/_debug_passes.py
+++ b/backends/arm/_passes/_debug_passes.py
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Set, Type
+
 import torch
 from executorch.devtools.visualization.visualization_utils import visualize_graph
 from executorch.exir import ExportedProgram
@@ -14,6 +16,8 @@ class VisualizePass(ExportPass):
     This pass visualizes the graph at the point of insertion in the pass manager
     """
 
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
     def __init__(self, exported_program: ExportedProgram) -> None:
         super().__init__()
         self.exported_program = exported_program
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
index 6b89b0c3c4a..72ae46c76c1 100644
--- a/backends/arm/_passes/annotate_decomposed_matmul.py
+++ b/backends/arm/_passes/annotate_decomposed_matmul.py
@@ -73,7 +73,10 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 node for node in partition.nodes if node.target in matmul_targets
             ][0]
 
-            if quantized_input:
+            if quantized_input and not all(
+                input_node.target in DQ_OPS
+                for input_node in matmul_node.all_input_nodes
+            ):
                 matmul_args = matmul_node.all_input_nodes
                 for node in matmul_args:
                     # Find the dq-node connected to this mm/bmm arg
@@ -99,7 +102,9 @@ def call(self, graph_module: GraphModule) -> PassResult:
 
             partition_output = list(partition.output_nodes[0].users)[0]
             quantized_output = partition_output.target in Q_OPS
-            if quantized_output:
+            if quantized_output and not all(
+                user.target in Q_OPS for user in matmul_node.users
+            ):
                 with graph_module.graph.inserting_after(matmul_node):
                     # Create q-node after matmul
                     q_node = create_node(
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index b7c511bbe0b..325f667f0ac 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -36,6 +36,7 @@
     DecomposeAcoshPass,
     DecomposeAdaptiveAvgPool2dPass,
     DecomposeAddmmPass,
+    DecomposeAddSubAlphaPass,
     DecomposeAsinAndAcosPass,
     DecomposeAsinhPass,
     DecomposeAtanhPass,
@@ -81,6 +82,7 @@
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
     InsertInt32CastsAfterInt64PlaceholdersPass,
+    InsertRescaleInt32Pass,
     InsertRescalePass,
     InsertTableOpsPass,
     MatchArgDtypePass,
@@ -91,6 +93,7 @@
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
+    RewriteMatmulPass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
@@ -210,10 +213,12 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
 
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
         self.add_pass(InsertRescalePass())
+        self.add_pass(InsertRescaleInt32Pass())
 
         self.validate_constraints_mandatory()
         return self._transform(exported_program.graph_module)
@@ -258,6 +263,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         )
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeDivPass())
+        self.add_pass(DecomposeAddSubAlphaPass())
         self.add_pass(DecomposeSoftmaxPass())
         self.add_pass(DecomposeGeluPass())
         self.add_pass(ConvertFullLikeToFullPass())
@@ -295,6 +301,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteUpsamplePass(exported_program))
         self.add_pass(AddBiasPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(RewriteMatmulPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
@@ -329,6 +336,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeSignPass())
         self.add_pass(DecomposeAddmmPass())
         self.add_pass(DecomposeDivTensorModePass())
+        self.add_pass(DecomposeAddSubAlphaPass())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeGroupNormPass())
diff --git a/backends/arm/_passes/convert_minmax_pass.py b/backends/arm/_passes/convert_minmax_pass.py
index f1c81dbc41e..79bb6e2db0c 100644
--- a/backends/arm/_passes/convert_minmax_pass.py
+++ b/backends/arm/_passes/convert_minmax_pass.py
@@ -3,9 +3,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Set, Type
+from typing import cast, Set, Type
 
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.arm._passes.convert_squeezes_to_view import (
     ConvertSqueezesToViewPass,
 )
@@ -101,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule):
             replace_node, op, squeeze_op = self.get_variables(node)
 
             # Unwrap args
-            if len(node.args) == 2:
+            if len(node.args) == 1:
+                # If dims is unspecified, min/max over all dims.
+                input_node = cast(torch.fx.Node, node.args[0])
+                input_shape = get_first_fake_tensor(input_node).shape
+                dims = range(len(input_shape))
+                keepdims = False
+            elif len(node.args) == 2:
                 input_node, dims = node.args
                 keepdims = False
             elif len(node.args) == 3:
                 input_node, dims, keepdims = node.args
             else:
-                raise RuntimeError(f"Unexpected arg size in {node.name}")
+                raise RuntimeError(
+                    f"Unexpected arg size {len(node.args)} in {node.name}"
+                )
 
             try:
-                iter(dims)
-            except:
-                dims = [dims]
+                iter(dims)  # type:ignore[assignment]
+            except Exception:
+                dims = [dims]  # type:ignore[assignment]
             else:
-                dims = list(dims)
+                dims = list(dims)  # type:ignore[assignment]
 
             # Unroll multi-dimensional reduction and keep-dims arg
             with graph_module.graph.inserting_before(node):
diff --git a/backends/arm/_passes/decompose_add_sub_alpha_pass.py b/backends/arm/_passes/decompose_add_sub_alpha_pass.py
new file mode 100644
index 00000000000..c0ed1bae09b
--- /dev/null
+++ b/backends/arm/_passes/decompose_add_sub_alpha_pass.py
@@ -0,0 +1,94 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import numbers
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+_ADD_OPS = (
+    exir_ops.edge.aten.add.Tensor,
+    torch.ops.aten.add.Tensor,
+)
+
+_SUB_OPS = (
+    exir_ops.edge.aten.sub.Tensor,
+    torch.ops.aten.sub.Tensor,
+)
+
+
+def _get_ops(op):
+    if op in _ADD_OPS:
+        if op is exir_ops.edge.aten.add.Tensor:
+            return (
+                exir_ops.edge.aten.mul.Tensor,
+                exir_ops.edge.aten.full.default,
+                exir_ops.edge.aten.add.Tensor,
+            )
+        return (
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.full.default,
+            torch.ops.aten.add.Tensor,
+        )
+    if op in _SUB_OPS:
+        if op is exir_ops.edge.aten.sub.Tensor:
+            return (
+                exir_ops.edge.aten.mul.Tensor,
+                exir_ops.edge.aten.full.default,
+                exir_ops.edge.aten.sub.Tensor,
+            )
+        return (
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.full.default,
+            torch.ops.aten.sub.Tensor,
+        )
+    raise RuntimeError(f"Unsupported operator {op}")
+
+
+def _should_decompose(alpha) -> bool:
+    if isinstance(alpha, numbers.Number):
+        return alpha != 1
+    return False
+
+
+class DecomposeAddSubAlphaPass(ArmPass):
+    """Rewrite add/sub with alpha into a mul followed by add/sub."""
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call_operator(self, op, args, kwargs, meta, updated: bool | None = False):
+        if op not in _ADD_OPS + _SUB_OPS:
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        alpha = kwargs.get("alpha", 1)
+        if not _should_decompose(alpha):
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        mul_op, full_op, binary_op = _get_ops(op)
+        lhs, rhs = args
+
+        alpha_full = super().call_operator(
+            full_op, ((1,), float(alpha)), {}, meta, updated=True
+        )
+        scaled_rhs = super().call_operator(
+            mul_op,
+            (rhs, alpha_full),
+            {},
+            meta,
+            updated=True,
+        )
+        return super().call_operator(
+            binary_op,
+            (lhs, scaled_rhs),
+            {},
+            meta,
+            updated=True,
+        )
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
index e3e0a873020..4d4c0ee75b1 100644
--- a/backends/arm/_passes/decompose_meandim_pass.py
+++ b/backends/arm/_passes/decompose_meandim_pass.py
@@ -94,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta):
         input_shape = list(x.data.shape)
         output_shape = list(meta["val"].shape)
         dims_to_reduce = get_node_arg(args, 1)
+        if dims_to_reduce is None:
+            dims_to_reduce = range(len(input_shape))
         dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce]
         dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1]
 
diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py
index 07d8288b5f1..c48fc008b5d 100644
--- a/backends/arm/_passes/fuse_constant_ops_pass.py
+++ b/backends/arm/_passes/fuse_constant_ops_pass.py
@@ -114,6 +114,7 @@ def call(self, graph_module):
             if node.op != "call_function":
                 continue
             if node.target in [
+                exir_ops.backend.tosa.MATMUL.default,
                 exir_ops.backend.tosa.RESCALE.default,
                 exir_ops.backend.tosa.RESIZE.default,
                 exir_ops.backend.tosa.TABLE.default,
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
index 100ac03c2b0..d56e70e78b3 100644
--- a/backends/arm/_passes/insert_rescales_pass.py
+++ b/backends/arm/_passes/insert_rescales_pass.py
@@ -4,9 +4,14 @@
 # LICENSE file in the root directory of this source tree.
 
 from copy import copy
-from typing import cast, Set, Type
+from typing import cast, Dict, Optional, Set, Tuple, Type
 
-from executorch.backends.arm._passes.arm_pass_utils import create_node
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import create_node, set_node_arg
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+    get_output_qparams,
+)
 from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -65,3 +70,234 @@ def call(self, graph_module: GraphModule) -> PassResult:
         graph_module = super().call(graph_module).graph_module
         graph_module.recompile()
         return PassResult(graph_module, modified)
+
+
+class InsertRescaleInt32Pass(ArmPass):
+    """
+    Numerous TOSA ops require inputs and outputs to be 32-bit integers in their
+    quantized implementations. This pass treats such operator nodes by
+    inserting rescale ops before and after them if needed. Note that extra logic
+    that handles the scales and zero points must be in place because the affected
+    TOSA have naive implementations that do not account for the quantization
+    parameters.
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    included_targets = [
+        exir_ops.edge.aten.abs.default,
+        exir_ops.edge.aten.eq.Tensor,
+        exir_ops.edge.aten.ge.Tensor,
+        exir_ops.edge.aten.gt.Tensor,
+        exir_ops.edge.aten.le.Tensor,
+        exir_ops.edge.aten.lt.Tensor,
+        exir_ops.edge.aten.maximum.default,
+        exir_ops.edge.aten.minimum.default,
+    ]
+
+    def _int32_qargs(self, s):
+        """Helper creator function for INT32-based QuantArgs"""
+
+        return QuantArgs(
+            scale=s,
+            zp=0,
+            qmin=torch.iinfo(torch.int32).min,
+            qmax=torch.iinfo(torch.int32).max,
+            dtype=torch.int32,
+        )
+
+    def _get_inputs_rescaled_qparams(
+        self, target, input_qparams: Dict[int, QuantArgs]
+    ) -> Dict[int, QuantArgs]:
+        """Get the qparams for the INT32 operands to the op ``target``
+
+        Inputs to the INT32-based operator must be rescaled from INT8 to INT32.
+        This function computes the ``QuantArgs`` for each of the operands and returns
+        it as a dict, mapping tensor index to ``QuantArgs``.
+        """
+
+        if target in [
+            exir_ops.edge.aten.abs.default,
+            exir_ops.edge.aten.eq.Tensor,
+            exir_ops.edge.aten.ge.Tensor,
+            exir_ops.edge.aten.gt.Tensor,
+            exir_ops.edge.aten.le.Tensor,
+            exir_ops.edge.aten.lt.Tensor,
+            exir_ops.edge.aten.minimum.default,
+            exir_ops.edge.aten.maximum.default,
+        ]:
+            # For these ops, use the smallest scale among the INT8 operands.
+            min_scale = min(
+                [qp.get_scale_per_tensor() for qp in input_qparams.values()]
+            )
+            qparams = {
+                i: self._int32_qargs(min_scale) for i in range(len(input_qparams))
+            }
+        else:
+            raise ValueError(f"Not a valid target: {target}")
+
+        return qparams
+
+    def _get_output_qparams(
+        self, target, inputs_qparams: Dict[int, QuantArgs]
+    ) -> Optional[QuantArgs]:
+        """Given an op ``target`` and the ``QuantArgs`` for each of its inputs, compute
+        the scale of the output based on how the operator itself affects it."""
+
+        if target in [
+            exir_ops.edge.aten.abs.default,
+            exir_ops.edge.aten.maximum.default,
+            exir_ops.edge.aten.minimum.default,
+        ]:
+            # The op has not altered the scale; the output scale is equal to
+            # the operands' scales.
+            return self._int32_qargs(inputs_qparams[0].get_scale_per_tensor())
+        elif target in [
+            exir_ops.edge.aten.eq.Tensor,
+            exir_ops.edge.aten.ge.Tensor,
+            exir_ops.edge.aten.gt.Tensor,
+            exir_ops.edge.aten.le.Tensor,
+            exir_ops.edge.aten.lt.Tensor,
+        ]:
+            # Output is bool for these ops and thus no qparams are present
+            return None
+        else:
+            raise ValueError(f"Not a valid target: {target}")
+
+    def _get_rescale_qparams(
+        self, target, input_qparams: Dict[int, QuantArgs]
+    ) -> Tuple[Dict[int, QuantArgs], Optional[QuantArgs]]:
+        """
+        Get the quantization parameters of the INT32 inputs/outputs that will
+        surround the node after the new RESCALE ops have been inserted.
+        """
+
+        inputs_rescaled_qparams = self._get_inputs_rescaled_qparams(
+            target, input_qparams
+        )
+        output_qparams = self._get_output_qparams(target, inputs_rescaled_qparams)
+
+        return (inputs_rescaled_qparams, output_qparams)
+
+    def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> bool:
+        qargs = node.meta["input_qparams"]
+
+        args_copy = list(node.args)
+        seen_args = set()
+        modified = False
+        for i in qargs:
+            qp = qargs[i]
+            if qp.dtype != torch.int8:
+                continue
+
+            arg_node = args_copy[i]
+            if arg_node in seen_args:
+                continue
+            seen_args.add(arg_node)
+
+            with graph.inserting_after(arg_node):
+                rescale_node = create_node(
+                    graph,
+                    exir_ops.backend.tosa.RESCALE.default,
+                    (
+                        arg_node,
+                        torch.int32,
+                        qp.get_scale_per_tensor()
+                        / rescale_qargs[
+                            i
+                        ].get_scale_per_tensor(),  # Old scale / new scale
+                        qp.get_zp_per_tensor(),  # Old zero point
+                        rescale_qargs[i].get_zp_per_tensor(),  # New zero point
+                    ),
+                    from_node=node,
+                )
+
+                node.replace_input_with(arg_node, rescale_node)
+                modified = True
+
+        return modified
+
+    def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> bool:
+        if "output_qparams" not in node.meta or len(node.meta["output_qparams"]) == 0:
+            return False
+
+        qargs = get_output_qparams(node)
+        assert len(qargs) == 1
+        assert rescale_qargs is not None
+
+        qarg = qargs[0]
+        if qarg.dtype != torch.int8:
+            return False
+
+        users_copy = list(node.users)
+
+        with graph.inserting_after(node):
+            rescale_node = create_node(
+                graph,
+                exir_ops.backend.tosa.RESCALE.default,
+                (
+                    node,
+                    torch.int8,
+                    rescale_qargs.get_scale_per_tensor()
+                    / qarg.get_scale_per_tensor(),  # Old scale / new scale
+                    rescale_qargs.get_zp_per_tensor(),  # Old zero point
+                    qarg.get_zp_per_tensor(),  # New zero point
+                ),
+                from_node=node,
+            )
+
+        for user in users_copy:
+            user.replace_input_with(node, rescale_node)
+
+        return True
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        graph = graph_module.graph
+
+        modified = False
+        for node in list(graph.nodes):
+            node = cast(Node, node)
+
+            if node.op != "call_function" or node.target not in self.included_targets:
+                continue
+
+            if "input_qparams" not in node.meta or len(node.meta["input_qparams"]) == 0:
+                continue
+            input_qparams = node.meta["input_qparams"]
+
+            inputs_rescale_qargs, output_rescale_qargs = self._get_rescale_qparams(
+                node.target, input_qparams
+            )
+
+            inputs_was_rescaled = self._rescale_inputs(
+                graph, node, inputs_rescale_qargs
+            )
+            outputs_was_rescaled = False
+            if inputs_was_rescaled:
+                outputs_was_rescaled = self._rescale_outputs(
+                    graph, node, output_rescale_qargs
+                )
+                modified = True
+
+            # Update node metadata
+
+            if inputs_was_rescaled:
+                assert len(inputs_rescale_qargs) == len(node.meta["input_qparams"])
+                node.meta["input_qparams"] = inputs_rescale_qargs
+
+            if outputs_was_rescaled:
+                assert len(node.meta["output_qparams"]) == 1
+                node.meta["output_qparams"] = {0: output_rescale_qargs}
+
+                # If the output type is specified in the node, change it such
+                # that it matches the subsequent rescale node(s) that this node
+                # now has output edges to.
+                if "dtype" in node.kwargs:
+                    set_node_arg(node, "dtype", torch.int32)
+
+        if modified:
+            # Retrace the graph to update the fake tensor types
+            graph_module = super().call(graph_module).graph_module
+            graph_module.recompile()
+
+        return PassResult(graph_module, modified)
diff --git a/backends/arm/_passes/rewrite_matmul.py b/backends/arm/_passes/rewrite_matmul.py
new file mode 100644
index 00000000000..28ff800792b
--- /dev/null
+++ b/backends/arm/_passes/rewrite_matmul.py
@@ -0,0 +1,97 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+    get_input_qparams,
+    get_output_qparams,
+)
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class RewriteMatmulPass(ArmPass):
+    """Rewrites aten.bmm to tosa.MATMUL and inserts a tosa.RESCALE op if needed."""
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def _insert_output_rescale(self, graph_module, node, tosa_matmul_node, dtype):
+        input_qparams = get_input_qparams(node)
+        output_qparams = get_output_qparams(node)[0]
+        scale = (
+            input_qparams[0].get_scale_per_tensor()
+            * input_qparams[1].get_scale_per_tensor()
+        ) / output_qparams.get_scale_per_tensor()
+
+        with graph_module.graph.inserting_after(tosa_matmul_node):
+            # If the input is int8, we need to cast the output to int32
+            rescale_node = create_node(
+                graph_module.graph,
+                op_target=exir_ops.backend.tosa.RESCALE.default,
+                from_node=tosa_matmul_node,
+            )
+            tosa_matmul_node.replace_all_uses_with(rescale_node)
+            rescale_node.args = (
+                tosa_matmul_node,
+                dtype,
+                scale,
+                0,
+                output_qparams.get_zp_per_tensor(),
+            )
+
+    def call(self, graph_module):
+        modified = False
+        for node in graph_module.graph.nodes:
+            if (
+                node.op != "call_function"
+                or node.target != exir_ops.edge.aten.bmm.default
+            ):
+                continue
+            modified = True
+
+            x1, x2 = node.args
+            tosa_matmul_target = exir_ops.backend.tosa.MATMUL.default
+            with graph_module.graph.inserting_before(node):
+                tosa_matmul_node = create_node(
+                    graph_module.graph,
+                    op_target=tosa_matmul_target,
+                    args=(x1, x2),
+                    kwargs={},
+                    from_node=node,
+                )
+                node.replace_all_uses_with(tosa_matmul_node)
+                graph_module.graph.erase_node(node)
+
+            x1_fake_tensor = get_first_fake_tensor(x1)
+            x2_fake_tensor = get_first_fake_tensor(x2)
+            output_fake_tensor = tosa_matmul_target(x1_fake_tensor, x2_fake_tensor)
+            node_output_fake_tensor = get_first_fake_tensor(node)
+            if (
+                output_fake_tensor.dtype == torch.int32
+                and node_output_fake_tensor.dtype in (torch.int8, torch.int16)
+            ):
+                self._insert_output_rescale(
+                    graph_module,
+                    node,
+                    tosa_matmul_node,
+                    dtype=node_output_fake_tensor.dtype,
+                )
+                if x1_fake_tensor.dtype == torch.int16:
+                    tosa_matmul_node.meta[TosaSpecialDtype.meta_key()] = (
+                        TosaSpecialDtype.INT48
+                    )
+
+        if modified:
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified)
diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py
index dcbdfb03f7b..b906c06b329 100644
--- a/backends/arm/_passes/to_tosa_memory_format_pass.py
+++ b/backends/arm/_passes/to_tosa_memory_format_pass.py
@@ -26,6 +26,9 @@
     NNCHW_ORDER,
     NNHWC_INVERSE_ORDER,
     NNHWC_ORDER,
+    NNNCHW_ORDER,
+    NNNHWC_INVERSE_ORDER,
+    NNNHWC_ORDER,
 )
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -51,12 +54,6 @@ class ToTosaMemoryFormatPass(ExportPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
-    NHWC_order = (0, 2, 3, 1)
-    NHWC_inverse_order = (0, 3, 1, 2)
-    HWCM_order = (2, 3, 0, 1)
-    NNHWC_order = (0, 1, 3, 4, 2)
-    NNHWC_inverse_order = (0, 1, 4, 2, 3)
-
     def __init__(self, exported_program: ExportedProgram) -> None:
         self.exported_program = exported_program
         super().__init__()
@@ -93,7 +90,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
     @staticmethod
     def memory_format_differs(shape):
         """Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format"""
-        if len(shape) >= 5:
+        if len(shape) >= 6:
+            C = shape[3]
+            H = shape[4]
+            W = shape[5]
+        elif len(shape) == 5:
             C = shape[2]
             H = shape[3]
             W = shape[4]
@@ -112,25 +113,26 @@ def memory_format_differs(shape):
 
     @staticmethod
     def is_channel_reshape(input_shape, output_shape):
-        """Returns true if the reshape changes the channel dimension"""
-        if not (
-            (len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5)))
-            or (len(input_shape) == 4 and len(output_shape) == 5)
-            or (len(input_shape) == 5 and len(output_shape) == 4)
-        ):
+        """Returns true if reshape changes the channel dimension or batch product dimension(s)"""
+
+        valid_ranks = {4, 5, 6}
+
+        if not (len(input_shape) in valid_ranks and len(output_shape) in valid_ranks):
             return False
 
         C_old = input_shape[-3]
         C_new = output_shape[-3]
 
-        N_new = (
-            output_shape[0]
-            if len(output_shape) == 4
-            else output_shape[0] * output_shape[1]
-        )
-        N_old = (
-            input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1]
-        )
+        def get_batch_prod_dim(shape):
+            product = 1
+
+            for dim in shape[:-3]:
+                product = product * dim
+
+            return product
+
+        N_old = get_batch_prod_dim(input_shape)
+        N_new = get_batch_prod_dim(output_shape)
 
         return (N_old != N_new) or (C_old != C_new)
 
@@ -141,17 +143,27 @@ def insert_input_transpose(node, input_node, graph_module):
             node.replace_input_with(input_node, pre_permute_node)
             return
 
+        if len(get_first_fake_tensor(input_node).size()) == 6:
+            mem_format = NNNHWC_INVERSE_ORDER
+        elif len(get_first_fake_tensor(input_node).size()) == 5:
+            mem_format = NNHWC_INVERSE_ORDER
+        else:
+            mem_format = NHWC_INVERSE_ORDER
+        # Guard: mem_format must be a true permutation for the current rank
+        _rank_ = len(
+            get_first_fake_tensor(input_node).size()
+        )  # or (node) in output path
+        assert sorted(mem_format) == list(
+            range(_rank_)
+        ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose"
+
         with graph_module.graph.inserting_before(node):
             permute_node = create_node(
                 graph_module.graph,
                 exir_ops.backend.tosa.TRANSPOSE.default,
                 args=(
                     input_node,
-                    list(
-                        NNHWC_INVERSE_ORDER
-                        if len(get_first_fake_tensor(input_node).size()) == 5
-                        else NHWC_INVERSE_ORDER
-                    ),
+                    list(mem_format),
                 ),
                 from_node=node,
             )
@@ -163,26 +175,38 @@ def insert_input_transpose(node, input_node, graph_module):
 
     @staticmethod
     def insert_output_transpose(node, graph_module):
+
+        if len(get_first_fake_tensor(node).size()) == 6:
+            mem_format = NNNHWC_ORDER
+        elif len(get_first_fake_tensor(node).size()) == 5:
+            mem_format = NNHWC_ORDER
+        else:
+            mem_format = NHWC_ORDER
+        # Guard: mem_format must be a true permutation for the current rank
+        _rank_ = len(get_first_fake_tensor(node).size())  # or (node) in output path
+        assert sorted(mem_format) == list(
+            range(_rank_)
+        ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose"
+
         with graph_module.graph.inserting_after(node):
             permute_node = create_node(
                 graph_module.graph,
                 exir_ops.backend.tosa.TRANSPOSE.default,
                 args=(
                     node,
-                    list(
-                        NNHWC_ORDER
-                        if len(get_first_fake_tensor(node).size()) == 5
-                        else NHWC_ORDER
-                    ),
+                    list(mem_format),
                 ),
                 from_node=node,
             )
 
-            permute_node.meta["tosa_dim_order"] = (
-                NNHWC_ORDER
-                if len(get_first_fake_tensor(node).size()) == 5
-                else NHWC_ORDER
-            )
+            rank = len(get_first_fake_tensor(node).size())
+            if rank == 6:
+                permute_node.meta["tosa_dim_order"] = NNNHWC_ORDER
+            elif rank == 5:
+                permute_node.meta["tosa_dim_order"] = NNHWC_ORDER
+            else:
+                permute_node.meta["tosa_dim_order"] = NHWC_ORDER
+
             node.meta["tosa_dim_order"] = tuple(
                 range(len(get_first_fake_tensor(node).size()))
             )
@@ -261,7 +285,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
         ]
         for input_node in inputs:
             input_dim_order = get_first_fake_tensor(input_node).dim_order()
-            if input_dim_order in (NCHW_ORDER, NNCHW_ORDER):
+            if input_dim_order in (NCHW_ORDER, NNCHW_ORDER, NNNCHW_ORDER):
                 self.insert_output_transpose(input_node, graph_module)
 
         # Transpose outputs if they are in (N)NCHW format
@@ -276,6 +300,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
             if output_dim_order in (
                 NCHW_ORDER,
                 NNCHW_ORDER,
+                NNNCHW_ORDER,
             ):
                 self.insert_input_transpose(
                     output_node, output_node_input, graph_module
@@ -313,6 +338,8 @@ def call(self, graph_module: torch.fx.GraphModule):
                     dim_order = HWCM_ORDER
             elif node_data.dim() == 5:
                 dim_order = NNHWC_ORDER
+            elif node_data.dim() == 6:
+                dim_order = NNNHWC_ORDER
             else:
                 dim_order = tuple(range(node_data.dim()))  # type: ignore[assignment]
 
diff --git a/backends/arm/constants.py b/backends/arm/constants.py
index b9995410b23..0e562f12e88 100644
--- a/backends/arm/constants.py
+++ b/backends/arm/constants.py
@@ -34,10 +34,13 @@
 NHWC_INVERSE_ORDER: Final = (0, 3, 1, 2)
 NNHWC_ORDER: Final = (0, 1, 3, 4, 2)
 NNHWC_INVERSE_ORDER: Final = (0, 1, 4, 2, 3)
+NNNHWC_ORDER: Final = (0, 1, 2, 4, 5, 3)
+NNNHWC_INVERSE_ORDER: Final = (0, 1, 2, 5, 3, 4)
 
 NCHW_ORDER: Final = (0, 1, 2, 3)
-NCHW_INVERSE_ORDER: Final = (0, 2, 3, 1)
 NNCHW_ORDER: Final = (0, 1, 2, 3, 4)
-NNCHW_INVERSE_ORDER: Final = (0, 1, 3, 4, 2)
+NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5)
 
 HWCM_ORDER: Final = (2, 3, 0, 1)
+
+MAX_RANK: Final = 6
diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py
index 6e9d3b3528e..f335c5046f5 100644
--- a/backends/arm/operator_support/convolution_support.py
+++ b/backends/arm/operator_support/convolution_support.py
@@ -2,6 +2,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Declare operator support for ``aten.convolution`` in TOSA.
+
+Provide general checks and hardware-specific constraints (e.g., U55 subset) for
+convolution nodes prior to delegation to the TOSA backend.
+
+"""
 
 from typing import cast
 
@@ -18,6 +24,8 @@
 
 @register_tosa_support_check
 class ConvolutionSupported(SupportedTOSAOperatorCheck):
+    """Provide TOSA support check for convolutions."""
+
     targets = [exir_ops.edge.aten.convolution.default]
 
     tosa_specs = [
@@ -25,8 +33,15 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck):
         TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
-    def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+    def is_node_tosa_supported(
+        self, node: fx.Node, tosa_spec: TosaSpecification
+    ) -> bool:
+        """Return True if the node is supported by TOSA.
 
+        Reject transposed convolutions and convolutions with non-zero output
+        padding. Apply additional hardware-specific constraints for U55.
+
+        """
         # Not implemented
         transposed = cast(bool, node.args[6])
         output_padding = cast(list[int], node.args[7])
@@ -46,9 +61,19 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
         else:
             return True
 
-    def _is_node_supported_u55(self, node: fx.Node):
-        """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)"""
+    def _is_node_supported_u55(self, node: fx.Node) -> bool:
+        """Enforce Ethos-U55-specific constraints (Vela 4.2.0).
+
+        Check channel dimensions, kernel sizes, and stride/pad/dilation
+        combinations permitted on U55.
 
+        Args:
+            node (fx.Node): Convolution node to validate.
+
+        Returns:
+            bool: True if supported; otherwise, False.
+
+        """
         shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape
         shape_out = node.meta["val"].shape
         kernel = cast(fx.Node, node.args[1]).meta["val"].shape
@@ -98,13 +123,17 @@ def _is_node_supported_u55(self, node: fx.Node):
         return True
 
     def _stride_condition(self, node: fx.Node) -> bool:
-        """This condition is somewhat complex but boils down
-        to not supporting stride > 3, unless we have some special conditions.
-        This condition is a simplified, relaxed version of the hardware constraint,
-        since the actual constraint requires information not available
-        here (without a lot of work).
+        """Check a simplified stride/padding/dilation constraint.
+
+        Disallow strides greater than 3 unless there is no padding and the
+        dilation is 1. For 3D convolutions, enforce ``stride_z <= 1``.
+
+        Args:
+            node (fx.Node): Convolution node to evaluate.
+
+        Returns:
+            bool: True if the condition is satisfied.
 
-        This means that we might accept ops that are not actually supported.
         """
         strides = cast(list[int], node.args[3])
         has_padding = any(pad > 0 for pad in cast(list[int], node.args[4]))
diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py
index 983aa091eec..27ddb95637b 100644
--- a/backends/arm/operator_support/ethos_u55_support.py
+++ b/backends/arm/operator_support/ethos_u55_support.py
@@ -2,6 +2,13 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Provide Ethos-U55 specific operator support checks.
+
+Contains dtype validation, explicit unsupported-op filtering, and shape/
+permutation constraints for view and permute operations when targeting the
+Ethos-U55 subset of TOSA.
+
+"""
 
 # pyre-unsafe
 
@@ -21,6 +28,19 @@
 
 
 def _try_determine_dtype(node: fx.Node) -> torch.dtype | None:
+    """Return an inferred dtype for a node when possible.
+
+    Uses fake tensor metadata and nearby quantize/dequantize nodes to infer the
+    integer dtype used by the operator. Returns ``None`` when the dtype cannot
+    be determined reliably.
+
+    Args:
+        node (fx.Node): FX node to inspect.
+
+    Returns:
+        torch.dtype | None: Inferred dtype or ``None`` if unknown.
+
+    """
     dtype = get_first_fake_tensor(node).dtype
     if not dtype.is_floating_point:
         return dtype
@@ -34,8 +54,23 @@ def _try_determine_dtype(node: fx.Node) -> torch.dtype | None:
 
 
 class EthosU55DtypeSupport(OperatorSupportBase):
+    """Validate dtypes for U55-supported operators.
+
+    Ensures operators use a supported integer dtype according to U55
+    constraints, with specific rules for convolution, matmul, and table ops.
+
+    Attributes:
+        reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+    """
 
     def __init__(self, reporter: WhyNoPartitionReporter):
+        """Initialize the check with a reporter.
+
+        Args:
+            reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+        """
         super().__init__()
         self.reporter = reporter
 
@@ -52,7 +87,20 @@ def __init__(self, reporter: WhyNoPartitionReporter):
     def is_node_supported(  # noqa: C901
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
+        """Return True if the node uses supported dtypes.
 
+        Applies per-operator dtype rules for U55, including specialized input
+        and weight constraints for convolution and int8-only checks for table
+        operations and matmul variants.
+
+        Args:
+            submodules (typing.Mapping[str, torch.nn.Module]): Exported modules.
+            node (fx.Node): FX node to check.
+
+        Returns:
+            bool: True if supported; otherwise, False.
+
+        """
         dtype = _try_determine_dtype(node)
         if dtype is None:
             # If we couldn't determine dtype, just return ok.
@@ -112,10 +160,12 @@ def is_node_supported(  # noqa: C901
 
 
 class EthosU55NotSupported(OperatorSupportBase):
-    """
-    Certain operators are not supported on U55. These are listed in `unsupported_ops`.
-    The comment mentions the unsupported TOSA operator that the aten operator maps to where it is not obvious.
-    For unimplemented operators, this is the anticipated mapping, and it might be incorrect.
+    """Reject operators not supported by Ethos-U55.
+
+    The ``unsupported_ops`` list contains aten ops that either map to TOSA
+    operators the U55 cannot run or remain unimplemented. The mapping comments
+    capture expected TOSA equivalents when not obvious.
+
     """
 
     unsupported_ops = [
@@ -165,12 +215,27 @@ class EthosU55NotSupported(OperatorSupportBase):
     ]
 
     def __init__(self, reporter: WhyNoPartitionReporter):
+        """Initialize the check with a reporter.
+
+        Args:
+            reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+        """
         self.reporter = reporter
 
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
+        """Return False for nodes explicitly unsupported on U55.
 
+        Args:
+            submodules (typing.Mapping[str, torch.nn.Module]): Exported modules.
+            node (fx.Node): FX node to check.
+
+        Returns:
+            bool: False if ``node.target`` is in ``unsupported_ops``; else True.
+
+        """
         if node.target in self.unsupported_ops:
             self.reporter.report_reject(node, "Op is not supported on U55.")
             return False
@@ -182,12 +247,37 @@ def is_node_supported(
 
 
 class EthosU55ViewCheck(OperatorSupportBase):
+    """Validate view/select shapes and dtypes for U55.
+
+    Performs lightweight checks on output shape rank and product constraints,
+    with awareness that transposes may be inserted around view/select during
+    lowering to channels-last.
+
+    Attributes:
+        reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+    """
 
     def __init__(self, reporter: WhyNoPartitionReporter):
+        """Initialize the check with a reporter.
+
+        Args:
+            reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+        """
         super().__init__()
         self.reporter = reporter
 
     def axes_product(self, nhwc_shape: shape_t) -> int:
+        """Return the product of all axes in ``nhwc_shape``.
+
+        Args:
+            nhwc_shape (list[int]): Shape in NHWC order.
+
+        Returns:
+            int: Product of the axis sizes.
+
+        """
         product = 1
         for axes in nhwc_shape:
             product *= axes
@@ -197,26 +287,27 @@ def axes_product(self, nhwc_shape: shape_t) -> int:
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
-        """
-        Check whether a given view node is supported on U55.
+        """Check whether a given view/select node is U55-supported.
 
         Currently only checks dtypes and product of axes.
 
-        It is not the view operator itself that is not supported on U55. In order for the
-        view operator to be compatible with the channels-last format of TosaBackend,
-        transposes may need to be inserted before and after the view op. If that happens
-        and that transpose operator does not adhere to the limitations then it will
-        result in the following error:
+        It is not the view operator itself that is not supported on U55. In
+        order for the view operator to be compatible with the channels-last
+        format of TosaBackend, transposes may need to be inserted before and
+        after the view op. If that happens and that transpose operator does not
+        adhere to the limitations then it will result in the following error:
 
             CPU performance estimation for "Transpose" not implemented.
             ...
             CPU operations are not supported for GraphAPI input
 
         Args:
-            node: The FX node representing the view_copy operator.
+            submodules (typing.Mapping[str, torch.nn.Module]): Exported modules.
+            node (fx.Node): FX node for ``view_copy`` or ``select``.
 
         Returns:
-            False if the operator is not support and True if it is supported.
+            bool: False if rejected by constraints; otherwise, True.
+
         """
         # Select decomposes into squeeze, which in turn becomes a view. Therefore,
         # perform the same check on select operators as view operators.
@@ -279,14 +370,40 @@ def is_node_supported(
 
 
 class EthosU55TransposeCheck(OperatorSupportBase):
+    """Validate permute nodes against U55 reshape/transpose limits.
+
+    Applies dtype- and rank-specific constraints to permutations. Tests both
+    NCHW and NHWC interpretations for rank-3/4 shapes since dim order is unknown
+    at partition time.
+
+    Attributes:
+        reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+    """
 
     def __init__(self, reporter: WhyNoPartitionReporter):
+        """Initialize the check with a reporter.
+
+        Args:
+            reporter (WhyNoPartitionReporter): Reporter for rejection reasons.
+
+        """
         super().__init__()
         self.reporter = reporter
 
     def _pad_to_rank_4(
         self, shape: shape_t, permutation: list[int]
     ) -> tuple[shape_t, shape_t]:
+        """Pad shape/permutation to rank 4 by prepending ones/indices.
+
+        Args:
+            shape (list[int]): Original shape.
+            permutation (list[int]): Original permutation indices.
+
+        Returns:
+            tuple[list[int], list[int]]: Padded shape and permutation.
+
+        """
         diff = 4 - len(shape)
         padded_shape = [1] * diff + shape
         for i in range(len(permutation)):
@@ -295,6 +412,15 @@ def _pad_to_rank_4(
         return padded_shape, padded_permutation
 
     def axes_product(self, nhwc_shape: shape_t) -> int:
+        """Return the product of all axes in ``nhwc_shape``.
+
+        Args:
+            nhwc_shape (list[int]): Shape in NHWC order.
+
+        Returns:
+            int: Product of the axis sizes.
+
+        """
         product = 1
         for axes in nhwc_shape:
             product *= axes
@@ -303,7 +429,7 @@ def axes_product(self, nhwc_shape: shape_t) -> int:
     def _permute_constraint_i8_i16(
         self, nhwc_shape: list[int], permutation: list[int]
     ) -> bool:
-        """Returns True if the constraints are ok."""
+        """Return True if permutation meets i8/i16 constraints."""
         N, H, W, C = nhwc_shape
         match permutation:
             case (0, 1, 2, 3):  # NHWC -> NHWC
@@ -316,7 +442,7 @@ def _permute_constraint_i8_i16(
     def _permute_constraint_i32(
         self, nhwc_shape: list[int], permutation: list[int]
     ) -> bool:
-        """Returns True if the constraints are ok."""
+        """Return True if permutation meets i32 constraints."""
         N, H, W, C = nhwc_shape
         match permutation:
             case (0, 1, 2, 3):  # NHWC -> NHWC
@@ -329,6 +455,7 @@ def _permute_constraint_i32(
                 return False
 
     def _permute_constraint(self, shape, permutation, dtype):
+        """Return True if permutation meets dtype-specific constraints."""
         if dtype in (torch.int8, torch.int16):
             return self._permute_constraint_i8_i16(shape, permutation)
         if dtype == torch.int32:
@@ -338,7 +465,19 @@ def _permute_constraint(self, shape, permutation, dtype):
     def is_node_supported(
         self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
     ) -> bool:
+        """Return True if a permute node satisfies U55 constraints.
+
+        Tests both NCHW and NHWC interpretations for rank-3/4 shapes, and
+        applies dtype-specific limits to shapes and permutations.
+
+        Args:
+            submodules (typing.Mapping[str, torch.nn.Module]): Exported modules.
+            node (fx.Node): FX node to check.
+
+        Returns:
+            bool: True if supported; otherwise, False.
 
+        """
         if not node.target == exir_ops.edge.aten.permute_copy.default:
             return True
 
diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
index d763ef23df2..86db2d9b0b6 100644
--- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py
+++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py
@@ -18,6 +18,7 @@
 
 
 # INT profile: ops supported via native TOSA ops, decompositions/transformations, precompute, TableOps, etc.
+# Note that ops supported via pre-quantization decompositions are not included here.
 TOSA_PRO_INT_SupportList: Final[Set] = {
     exir_ops.edge.aten.abs.default,
     exir_ops.edge.aten.add.Tensor,
@@ -46,8 +47,6 @@
     exir_ops.edge.aten.hardsigmoid.default,
     exir_ops.edge.aten.hardtanh.default,
     exir_ops.edge.aten.hardswish.default,
-    exir_ops.edge.aten.div.Tensor,
-    exir_ops.edge.aten.div.Tensor_mode,
     exir_ops.edge.aten.eq.Tensor,
     exir_ops.edge.aten.eq.Scalar,
     exir_ops.edge.aten.erf.default,
@@ -68,16 +67,7 @@
     exir_ops.edge.aten.lt.Tensor,
     exir_ops.edge.aten.lt.Scalar,
     exir_ops.edge.aten.mul.Tensor,
-    exir_ops.edge.aten.ne.Tensor,
-    exir_ops.edge.aten.ne.Scalar,
     exir_ops.edge.aten.neg.default,
-    exir_ops.edge.aten.add.Scalar,
-    exir_ops.edge.aten.sub.Scalar,
-    exir_ops.edge.aten.mul.Scalar,
-    exir_ops.edge.aten.div.Scalar,
-    exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
-    exir_ops.edge.aten.native_layer_norm.default,
-    exir_ops.edge.aten.native_group_norm.default,
     exir_ops.edge.aten.sigmoid.default,
     exir_ops.edge.aten.mean.dim,
     exir_ops.edge.aten.mm.default,
@@ -86,19 +76,12 @@
     exir_ops.edge.aten.repeat.default,
     exir_ops.edge.aten.reciprocal.default,
     exir_ops.edge.aten.relu.default,
-    exir_ops.edge.aten.leaky_relu.default,
-    exir_ops.edge.aten.sqrt.default,
     exir_ops.edge.aten.rsqrt.default,
-    exir_ops.edge.aten.round.default,
-    exir_ops.edge.aten._softmax.default,
     exir_ops.edge.aten.select_copy.int,
-    exir_ops.edge.aten._log_softmax.default,
     exir_ops.edge.aten.sub.Tensor,
     exir_ops.edge.aten.tanh.default,
     exir_ops.edge.aten.upsample_bilinear2d.vec,
     exir_ops.edge.aten.upsample_nearest2d.vec,
-    exir_ops.edge.aten.var.correction,
-    exir_ops.edge.aten.var.dim,
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.unsqueeze_copy.default,
     exir_ops.edge.aten.squeeze_copy.dims,
@@ -127,12 +110,9 @@
     exir_ops.edge.aten.sign.default,
     exir_ops.edge.aten.asin.default,
     exir_ops.edge.aten.atanh.default,
-    exir_ops.edge.aten.addmm.default,
     exir_ops.edge.aten.masked_fill.Scalar,
     exir_ops.edge.aten.asinh.default,
     exir_ops.edge.aten.cosh.default,
-    exir_ops.edge.aten.glu.default,
-    exir_ops.edge.aten.logit.default,
     exir_ops.edge.aten.acos.default,
     exir_ops.edge.aten.elu.default,
     exir_ops.edge.aten.bitwise_not.default,
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
index 86c53e4aff1..f7857894d40 100644
--- a/backends/arm/operator_support/tosa_supported_operators.py
+++ b/backends/arm/operator_support/tosa_supported_operators.py
@@ -19,7 +19,7 @@
     FuseQuantizedActivationPass,
 )
 from executorch.backends.arm._passes.insert_table_ops import TableOps
-from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.backends.arm.constants import DQ_OPS, MAX_RANK, Q_OPS
 from executorch.backends.arm.operator_support.ethos_u55_support import (
     EthosU55CastCheck,
     EthosU55DtypeSupport,
@@ -127,7 +127,7 @@ def tosa_support_factory(
     negative_checks: list[OperatorSupportBase] = [
         CheckInt64InputsAndOutputs(exported_program, reporter),
         CheckFloat64Inputs(exported_program, reporter),
-        RankCheck(reporter, max_rank=5),
+        RankCheck(reporter, max_rank=MAX_RANK),
         *[
             reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}")
             for check in (additional_checks if additional_checks else [])
@@ -135,7 +135,6 @@ def tosa_support_factory(
     ]
 
     if not tosa_spec.support_float():
-        negative_checks.append(NeedsDecompositionCheck(reporter))
         negative_checks.append(CheckProperQuantization(reporter))
     if tosa_spec.is_U55_subset:
         negative_checks.append(EthosU55NotSupported(reporter))
@@ -156,7 +155,8 @@ def tosa_support_factory(
 class TOSAProINTSupportList(OperatorSupportBase):
     """
     TOSA_PRO_INT_SupportList:
-        Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps
+        Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps.
+        Note that ops supported via pre-quantization decompositions are not included here.
     """
 
     def is_node_supported(
@@ -179,57 +179,6 @@ def is_node_supported(
         return node.op == "call_function" and node.target in TOSA_PRO_FP_SupportList
 
 
-class NeedsDecompositionCheck(OperatorSupportBase):
-    """
-    Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding
-    the operator, and to get optimal quantization parameters for each operator. This check will reject operators
-    that need to be decomposed.
-    """
-
-    def __init__(self, reporter: WhyNoPartitionReporter):
-        self.reporter = reporter
-
-    def is_node_supported(
-        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
-    ) -> bool:
-
-        if node.op != "call_function":
-            return True
-
-        needs_decomp_dict = {
-            exir_ops.edge.aten.div.Tensor: None,
-            exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.",
-            exir_ops.edge.aten.native_layer_norm.default: None,
-            exir_ops.edge.aten.native_group_norm.default: None,
-            exir_ops.edge.aten._softmax.default: None,
-            exir_ops.edge.aten._log_softmax.default: None,
-            exir_ops.edge.aten.var.correction: None,
-            exir_ops.edge.aten.var.dim: None,
-            exir_ops.edge.aten.add.Scalar: None,
-            exir_ops.edge.aten.sqrt.default: None,
-            exir_ops.edge.aten.sub.Scalar: None,
-            exir_ops.edge.aten.mul.Scalar: None,
-            exir_ops.edge.aten.ne.Tensor: None,
-            exir_ops.edge.aten.ne.Scalar: None,
-            exir_ops.edge.aten.div.Scalar: None,
-            exir_ops.edge.aten.leaky_relu.default: None,
-            exir_ops.edge.aten.round.default: None,
-            exir_ops.edge.aten.addmm.default: None,
-            exir_ops.edge.aten.glu.default: None,
-            exir_ops.edge.aten.logit.default: None,
-        }
-
-        if node.target in needs_decomp_dict:
-            reject_message = needs_decomp_dict[node.target]
-            if reject_message is None:
-                reject_message = "Op needs to be decomposed into other ops before quantization to get quantized properly."
-
-            self.reporter.report_reject(node, reject_message)
-            return False
-        else:
-            return True
-
-
 class CheckProperQuantization(OperatorSupportBase):
     """
     For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index d8b371570f6..9278d25959f 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -14,7 +14,6 @@
     op_any,
     op_avg_pool2d,
     op_bitwise_not,
-    op_bmm,
     op_cat,
     op_ceil,
     op_clamp,
@@ -42,8 +41,6 @@
     op_pow,
     op_reciprocal,
     op_repeat,
-    op_rescale,
-    op_resize,
     op_rshift_tensor,
     op_rsqrt,
     op_sigmoid,
@@ -51,10 +48,13 @@
     op_slice,
     op_sub,
     op_sum,
-    op_table,
     op_tanh,
     op_to_dim_order_copy,
-    op_transpose,
+    op_tosa_matmul,
+    op_tosa_rescale,
+    op_tosa_resize,
+    op_tosa_table,
+    op_tosa_transpose,
     op_view,
     op_where,
     ops_binary,
diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py
index ec76eb5517f..943c4778867 100644
--- a/backends/arm/operators/op_abs.py
+++ b/backends/arm/operators/op_abs.py
@@ -6,9 +6,6 @@
 # pyre-unsafe
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-import executorch.backends.arm.tosa.utils as tutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -18,22 +15,20 @@
     validate_same_dtype,
     validate_valid_dtype,
 )
-from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
+from executorch.backends.arm.tosa.specification import TosaSpecification
 from torch.fx import Node
 
 
 @register_node_visitor
-class AbsVisitor_INT(NodeVisitor):
+class AbsVisitor(NodeVisitor):
     target = "aten.abs.default"
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-1.0+INT"),
+        TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def define_node(
         self,
         node: Node,
@@ -47,89 +42,18 @@ def define_node(
         validate_num_inputs(self.target, inputs, 1)
         validate_same_dtype(self.target, [*inputs, output], ts)
 
-        # Handle int8 (quantized) and int32
         validate_valid_dtype(
             self.target,
             [*inputs, output],
-            [ts.DType.INT8, ts.DType.INT32],
+            [ts.DType.INT32, ts.DType.FP32],
             output.tosa_spec,
         )
 
-        scale_back = 1.0
-        if inputs[0].dtype == ts.DType.INT8:
-            rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )  # type: ignore[possibly-undefined]
-        else:
-            # input[0].dtype == ts.DType.INT32
-            # Non quantized input, natively support by TOSA.abs
-            rescaled_inputs = inputs
-
-        if output.dtype == ts.DType.INT8:
-            broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
-            abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
-        else:
-            # output.dtype == ts.DType.INT32
-            abs_output = output
-
-        # Do the INT32 Abs
-        self._serialize_operator(
-            node,
-            tosa_graph,
+        tosa_graph.addOperator(
             ts.TosaOp.Op().ABS,
             [
-                rescaled_inputs[0].name,
+                inputs[0].name,
             ],
-            [abs_output.name],
+            [output.name],
             None,
         )
-
-        if output.dtype == ts.DType.INT8:
-            # Scale output back to 8 bit
-            # pyre-ignore
-            tqutils.insert_rescale_op_to_int8(
-                tosa_graph, abs_output, scale_back, node, self.tosa_spec
-            )  # type: ignore[possibly-undefined]
-
-
-@register_node_visitor
-class AbsVisitor_FP(AbsVisitor_INT):
-    # inheriting 'target' from BI class
-
-    tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
-
-    def __init__(self, *args):
-        super().__init__(*args)
-
-    def define_node(
-        self,
-        node: Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-
-        import serializer.tosa_serializer as ts  # type: ignore
-
-        validate_num_inputs(self.target, inputs, 1)
-        validate_same_dtype(self.target, [*inputs, output], ts)
-
-        if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
-            # Call the inherited define_node for handling integers
-            super().define_node(node, tosa_graph, inputs, output)
-        else:
-            # FP32 Abs lowering
-
-            validate_valid_dtype(
-                self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec
-            )
-
-            # MI lowering
-            self._serialize_operator(
-                node,
-                tosa_graph,
-                ts.TosaOp.Op().ABS,
-                [inputs[0].name],
-                [output.name],
-                None,
-            )
diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py
index 469e6613c1f..933e353387b 100644
--- a/backends/arm/operators/op_conv2d.py
+++ b/backends/arm/operators/op_conv2d.py
@@ -182,11 +182,11 @@ def define_node(
             acc_type = ts.DType.FP32
 
         tosa_graph.addConst(
-            [1], output.dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
+            [1], inputs[0].dtype, [input_zp], name=f"{conv2d_output_name}_input_zp"
         )
         tosa_graph.addConst(
             [1],
-            output.dtype,
+            inputs[1].dtype,
             weight_zp,
             name=f"{conv2d_output_name}_weight_zp",
         )
@@ -269,7 +269,7 @@ def define_node(
 
         # For quantized convolution, rescale the output value back to the same
         # integer value domain of the next op. Otherwise return float32 output.
-        if inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.INT16:
+        if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16:
             # Get scale_factor from input, weight, and output.
             input_scale = input_qparams[0].get_scale_per_tensor()  # type: ignore[possibly-undefined]  # pyre-ignore [61]
             per_channel_quant = input_qparams[1].per_channel  # pyre-ignore [61]
diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py
index 2136fe2e946..76b6e67cd8d 100644
--- a/backends/arm/operators/op_eq.py
+++ b/backends/arm/operators/op_eq.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -56,23 +54,12 @@ def define_node(
         )
         validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
 
-        input_nodes = inputs
-        # Handle quantization
-        if inputs[0].dtype == ts.DType.INT8:
-            # Rescale inputs to 32 bit
-            rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            # Update IO
-            input_nodes = rescaled_inputs
-
         # Do the equal comparison
         self._serialize_operator(
             node,
             tosa_graph,
             ts.TosaOp.Op().EQUAL,
-            [input_nodes[0].name, input_nodes[1].name],
+            [inputs[0].name, inputs[1].name],
             [output.name],
             None,
         )
diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py
index c538e735880..4bb20cac77f 100644
--- a/backends/arm/operators/op_ge.py
+++ b/backends/arm/operators/op_ge.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
         )
         validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
 
-        input_nodes = inputs
-        # Handle quantization
-        if inputs[0].dtype == ts.DType.INT8:
-            # Rescale inputs to 32 bit
-            rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            # Update IO
-            input_nodes = rescaled_inputs
-
         self._serialize_operator(
             node,
             tosa_graph,
             ts.TosaOp.Op().GREATER_EQUAL,
-            [input_nodes[0].name, input_nodes[1].name],
+            [inputs[0].name, inputs[1].name],
             [output.name],
             None,
         )
diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py
index d407e28c1b6..c25c959681e 100644
--- a/backends/arm/operators/op_gt.py
+++ b/backends/arm/operators/op_gt.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
         )
         validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
 
-        input_nodes = inputs
-        # Handle quantization
-        if inputs[0].dtype == ts.DType.INT8:
-            # Rescale inputs to 32 bit
-            rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            # Update IO
-            input_nodes = rescaled_inputs
-
         self._serialize_operator(
             node,
             tosa_graph,
             ts.TosaOp.Op().GREATER,
-            [input_nodes[0].name, input_nodes[1].name],
+            [inputs[0].name, inputs[1].name],
             [output.name],
             None,
         )
diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py
index 403c6c233d3..e62d669814f 100644
--- a/backends/arm/operators/op_le.py
+++ b/backends/arm/operators/op_le.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
         )
         validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
 
-        input_nodes = inputs
-        # Handle quantization
-        if inputs[0].dtype == ts.DType.INT8:
-            # Rescale inputs to 32 bit
-            rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            # Update IO
-            input_nodes = rescaled_inputs
-
         self._serialize_operator(
             node,
             tosa_graph,
             ts.TosaOp.Op().GREATER_EQUAL,
-            [input_nodes[1].name, input_nodes[0].name],
+            [inputs[1].name, inputs[0].name],
             [output.name],
             None,
         )
diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py
index f5132dd4feb..cccb0abd5d7 100644
--- a/backends/arm/operators/op_lt.py
+++ b/backends/arm/operators/op_lt.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -56,22 +54,11 @@ def define_node(
         )
         validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec)
 
-        input_nodes = inputs
-        # Handle quantization
-        if inputs[0].dtype == ts.DType.INT8:
-            # Rescale inputs to 32 bit
-            rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            # Update IO
-            input_nodes = rescaled_inputs
-
         self._serialize_operator(
             node,
             tosa_graph,
             ts.TosaOp.Op().GREATER,
-            [input_nodes[1].name, input_nodes[0].name],
+            [inputs[1].name, inputs[0].name],
             [output.name],
             None,
         )
diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py
index 66437f8af1d..50c6e06a4bb 100644
--- a/backends/arm/operators/op_maximum.py
+++ b/backends/arm/operators/op_maximum.py
@@ -7,12 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
-    get_input_qparams,
-)
-
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -22,9 +16,8 @@
     validate_same_dtype,
     validate_valid_dtype,
 )
-from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
+from executorch.backends.arm.tosa.specification import TosaSpecification
 from torch.fx import Node
 
 
@@ -56,35 +49,12 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [*inputs, output],
-            [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+            [ts.DType.INT32, ts.DType.FP32],
             output.tosa_spec,
         )
 
-        scale_back = 1.0
-        max_output = output
-        if inputs[0].dtype == ts.DType.INT8:
-            input_qparams = get_input_qparams(node)
-            if len(input_qparams) != 2:
-                raise ValueError(
-                    f"Both inputs need to have quantization information for {node}"
-                )
-            if input_qparams[0] != input_qparams[1]:
-                raise ValueError(
-                    "Both inputs must have the same quantization parameters for MAX"
-                )
-
-            operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            output.shape = tosa_shape(output.shape, output.dim_order)
-            max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
-        else:
-            operand_inputs = inputs
-
         attr_maximum = ts.TosaSerializerAttribute()
-
-        # Set to PROPOGATE as default
+        # Set to PROPAGATE as default
         attr_maximum.MaximumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
 
         self._serialize_operator(
@@ -92,15 +62,9 @@ def define_node(
             tosa_graph,
             ts.TosaOp.Op().MAXIMUM,
             [
-                operand_inputs[0].name,
-                operand_inputs[1].name,
+                inputs[0].name,
+                inputs[1].name,
             ],
-            [max_output.name],
+            [output.name],
             attr_maximum,
         )
-
-        if output.dtype == ts.DType.INT8:
-            # insert RESCALE from int32 back to int8
-            tqutils.insert_rescale_op_to_int8(
-                tosa_graph, max_output, scale_back, node, self.tosa_spec
-            )
diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py
index 518366d5463..d5b97f186d3 100644
--- a/backends/arm/operators/op_minimum.py
+++ b/backends/arm/operators/op_minimum.py
@@ -7,11 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
-    get_input_qparams,
-)
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
     register_node_visitor,
@@ -23,7 +18,6 @@
 )
 from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.utils import tosa_shape
 from torch.fx import Node
 
 
@@ -55,35 +49,12 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [*inputs, output],
-            [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+            [ts.DType.INT32, ts.DType.FP32],
             output.tosa_spec,
         )
 
-        scale_back = 1.0
-        min_output = output
-        if inputs[0].dtype == ts.DType.INT8:
-            input_qparams = get_input_qparams(node)
-            if len(input_qparams) != 2:
-                raise ValueError(
-                    f"Both inputs need to have quantization information for {node}"
-                )
-            if input_qparams[0] != input_qparams[1]:
-                raise ValueError(
-                    "Both inputs must have the same quantization parameters for MIN"
-                )
-
-            operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-
-            output.shape = tosa_shape(output.shape, output.dim_order)
-            min_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
-        else:
-            operand_inputs = inputs
-
         attr_minimum = ts.TosaSerializerAttribute()
-
-        # Set to PROPOGATE as default
+        # Set to PROPAGATE as default
         attr_minimum.MinimumAttribute(nan_mode=NanPropagationMode.PROPAGATE)
 
         self._serialize_operator(
@@ -91,15 +62,9 @@ def define_node(
             tosa_graph,
             ts.TosaOp.Op().MINIMUM,
             [
-                operand_inputs[0].name,
-                operand_inputs[1].name,
+                inputs[0].name,
+                inputs[1].name,
             ],
-            [min_output.name],
+            [output.name],
             attr_minimum,
         )
-
-        if output.dtype == ts.DType.INT8:
-            # insert RESCALE from int32 back to int8
-            tqutils.insert_rescale_op_to_int8(
-                tosa_graph, min_output, scale_back, node, self.tosa_spec
-            )
diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py
index 5db7ce9347c..9ee4e9fedf8 100644
--- a/backends/arm/operators/op_repeat.py
+++ b/backends/arm/operators/op_repeat.py
@@ -44,7 +44,7 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [inputs[0], output],
-            [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32],
+            [ts.DType.INT8, ts.DType.INT32, ts.DType.INT16, ts.DType.FP32],
             output.tosa_spec,
         )
 
diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_tosa_matmul.py
similarity index 50%
rename from backends/arm/operators/op_bmm.py
rename to backends/arm/operators/op_tosa_matmul.py
index 2636a08d7c5..b177fd2ba37 100644
--- a/backends/arm/operators/op_bmm.py
+++ b/backends/arm/operators/op_tosa_matmul.py
@@ -13,7 +13,6 @@
 
 from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
     get_input_qparams,
-    get_output_qparams,
 )
 from executorch.backends.arm.operators.node_visitor import (
     NodeVisitor,
@@ -26,20 +25,13 @@
 )
 from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
-from executorch.backends.arm.tosa.quant_utils import build_rescale
-from tosa.RoundingMode import RoundingMode  # type: ignore
 
 
 @register_node_visitor
-class BMMVisitor(NodeVisitor):
-    """Provide a visitor that lowers ``aten.bmm`` to TOSA ``MATMUL``.
+class MatmulVisitor(NodeVisitor):
+    """Provide a visitor that serializes TOSA ``MATMUL``."""
 
-    INT8 accumulates into INT32; add a rescale to INT8 using SINGLE_ROUND
-    rounding and output zero-point.
-
-    """
-
-    target = "aten.bmm.default"
+    target = "tosa.MATMUL.default"
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-1.0+INT"),
@@ -56,35 +48,36 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
     ) -> None:
-        """Define the TOSA ``MATMUL`` operator and optional rescale."""
+        """Define the TOSA ``MATMUL`` operator."""
         import serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output], ts)
+        validate_same_dtype(self.target, [*inputs], ts)
         validate_valid_dtype(
             self.target,
-            [*inputs, output],
+            [*inputs],
             [ts.DType.INT8, ts.DType.INT16, ts.DType.FP32],
             output.tosa_spec,
         )
+        validate_valid_dtype(
+            self.target,
+            [output],
+            [ts.DType.INT32, ts.DType.INT48, ts.DType.FP32],
+            output.tosa_spec,
+        )
 
-        # aten.bmm maps directly to MATMUL
-
-        # For INT8, we need to get the zero points and add an intermediate tensor
-        # for a later rescale.
-
-        if inputs[0].dtype == ts.DType.INT8:
+        # We need to get the zero points and add an intermediate tensor for INT16 case
+        if inputs[0].dtype in (ts.DType.INT8, ts.DType.INT16):
             input_qparams = get_input_qparams(node)
             input0_zp = input_qparams[0].get_zp_per_tensor()
             input1_zp = input_qparams[1].get_zp_per_tensor()
-            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
-            bmm_output_name = bmm_result.name
         else:
-            bmm_output_name = output.name
             input0_zp, input1_zp = 0, 0
 
-        tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=f"{node.name}_A_ZP")
-        tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=f"{node.name}_B_ZP")
+        input_A_ZP_name = f"{node.name}_A_ZP"
+        input_B_ZP_name = f"{node.name}_B_ZP"
+        tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=input_A_ZP_name)
+        tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=input_B_ZP_name)
 
         # Add the MATMUL to the TOSA graph.
         self._serialize_operator(
@@ -94,27 +87,8 @@ def define_node(
             [
                 inputs[0].name,
                 inputs[1].name,
-                f"{node.name}_A_ZP",
-                f"{node.name}_B_ZP",
+                input_A_ZP_name,
+                input_B_ZP_name,
             ],
-            [bmm_output_name],
+            [output.name],
         )
-
-        # As INT8 accumulates into INT32, we need to rescale it back to INT8
-        if output.dtype == ts.DType.INT8:
-            output_qparams = get_output_qparams(node)[0]
-            final_output_scale = (
-                input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor()  # type: ignore[possibly-undefined]  # pyre-ignore[61]
-            ) / output_qparams.get_scale_per_tensor()
-
-            build_rescale(
-                tosa_fb=tosa_graph,
-                scale=[final_output_scale],
-                # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
-                input_node=bmm_result,  # type: ignore[possibly-undefined]
-                output_name=output.name,
-                output_type=ts.DType.INT8,
-                input_zp=[0],
-                output_zp=[output_qparams.get_zp_per_tensor()],
-                rounding_mode=RoundingMode.SINGLE_ROUND,
-            )
diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_tosa_rescale.py
similarity index 100%
rename from backends/arm/operators/op_rescale.py
rename to backends/arm/operators/op_tosa_rescale.py
diff --git a/backends/arm/operators/op_resize.py b/backends/arm/operators/op_tosa_resize.py
similarity index 100%
rename from backends/arm/operators/op_resize.py
rename to backends/arm/operators/op_tosa_resize.py
diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_tosa_table.py
similarity index 100%
rename from backends/arm/operators/op_table.py
rename to backends/arm/operators/op_tosa_table.py
diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_tosa_transpose.py
similarity index 100%
rename from backends/arm/operators/op_transpose.py
rename to backends/arm/operators/op_tosa_transpose.py
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index d7c85447dd5..349aa3e6b21 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -6,7 +6,7 @@
 import logging
 import operator
 from dataclasses import dataclass
-from typing import Callable, List, Optional, Sequence
+from typing import Callable, cast, List, Optional, Sequence
 
 import torch
 import torch.fx
@@ -137,11 +137,18 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
     node since histc op (in HistogramObserver) only works for values up to certain upper
     bound.
     """
+    HISTC_UPPER_BOUND = 3.4028235e15
     if node.op == "get_attr" and isinstance(node.target, str):
         tensor = _get_node_target(gm, node.target)
         # torch.histc works until this upper bound
-        HISTC_UPPER_BOUND = 3.4028235e15
         return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND
+    if node.op == "call_function" and node.target in (
+        torch.ops.aten.full.default,
+        torch.ops.aten.full,
+        torch.ops.aten.fill_.Scalar,
+    ):
+        fill_value = cast(float, node.args[1])
+        return abs(fill_value) > HISTC_UPPER_BOUND
     return False
 
 
@@ -358,14 +365,13 @@ def _match_pattern(
     torch.ops.aten.permute_copy.default,
     torch.ops.aten.avg_pool2d.default,
     torch.ops.aten.max_pool2d.default,
-    torch.ops.aten.full.default,
-    torch.ops.aten.full,
-    torch.ops.aten.fill_.Scalar,
     torch.ops.aten.flatten.using_ints,
     torch.ops.aten.dropout.default,
     torch.ops.aten.dropout_.default,
     torch.ops.aten.adaptive_avg_pool2d.default,
     torch.ops.aten.alias_copy.default,
+    torch.ops.aten.pixel_shuffle.default,
+    torch.ops.aten.pixel_unshuffle.default,
 ]
 
 
@@ -518,9 +524,6 @@ def any_or_hardtanh_min_zero(n: Node):
         ]
         quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
     elif node.target in _one_to_one_shared_input_or_input_act_qspec:
-        if not isinstance(node.args[0], Node):
-            return None
-
         input_qspec = (
             SharedQuantizationSpec(node.args[0])  # type: ignore[arg-type]
             if is_output_annotated(node.args[0])  # type: ignore
@@ -578,7 +581,12 @@ def any_or_hardtanh_min_zero(n: Node):
             ),
         ]
         quant_properties.quant_output = None
-    elif node.target in [torch.ops.aten.scalar_tensor.default]:
+    elif node.target in [
+        torch.ops.aten.scalar_tensor.default,
+        torch.ops.aten.full.default,
+        torch.ops.aten.full,
+        torch.ops.aten.fill_.Scalar,
+    ]:
         quant_properties.quant_inputs = []
         quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
     elif node.target in [operator.getitem]:
diff --git a/backends/arm/requirements-arm-ethos-u.txt b/backends/arm/requirements-arm-ethos-u.txt
index a26fb014234..9076aa08852 100644
--- a/backends/arm/requirements-arm-ethos-u.txt
+++ b/backends/arm/requirements-arm-ethos-u.txt
@@ -3,4 +3,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-ethos-u-vela == 4.4.0
+ethos-u-vela == 4.4.1
\ No newline at end of file
diff --git a/backends/arm/requirements-arm-tosa.txt b/backends/arm/requirements-arm-tosa.txt
index 0f9c2f702a4..16aa01a6c23 100644
--- a/backends/arm/requirements-arm-tosa.txt
+++ b/backends/arm/requirements-arm-tosa.txt
@@ -8,4 +8,4 @@ flatbuffers == 24.3.25
 tosa-adapter-model-explorer == 0.0.1
 ai-edge-model-explorer >= 0.1.16
 
-tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.0
+tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.1
diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp
index abb4c50d8be..fa8c7ead220 100644
--- a/backends/arm/runtime/VGFSetup.cpp
+++ b/backends/arm/runtime/VGFSetup.cpp
@@ -24,6 +24,13 @@ namespace vgf {
 /* static function to map format to byte count */
 static uint32_t get_format_size(VkFormat format);
 
+// SPV_ARM_tensor does not support rank-0 representations according to the spec.
+// Use an unsqueezed dimension when the resource table contains an empty
+// shape. Tensors are output as rank 0 when copied back from the vgf backend.
+namespace {
+constexpr int64_t kScalarSentinelDimension = 1;
+}
+
 // Debug function to inspect memory properties
 static string memory_flags_to_string(VkMemoryPropertyFlags flags) {
   if (flags == 0)
@@ -264,7 +271,11 @@ static void debug_print_resources(
             the_shape.size(),
             the_stride.size());
         for (int j = 0; j < the_shape.size(); j++) {
-          ET_LOG(Info, "      %d: dim %ld", j, the_shape[j]);
+          ET_LOG(
+              Info,
+              "      %d: dim %lld",
+              j,
+              static_cast<long long>(the_shape[j]));
         }
         // Allocate a tensor with bound memory
         break;
@@ -387,6 +398,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs) {
     // Get tensor shape and strides
     auto shape = resource_decoder->getTensorShape(i);
     auto stride = resource_decoder->getTensorStride(i);
+    const auto shape_size = shape.size();
 
     switch (resource_decoder->getCategory(i)) {
       case vgflib::ResourceCategory::INPUT:
@@ -409,9 +421,9 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs) {
         result = allocate_tensor(
             vk_physical,
             vk_device,
-            vgflib::ToVkFormat(resource_decoder->getVkFormat(i)),
-            static_cast<uint32_t>(shape.size()),
-            shape.begin(),
+            resource_format,
+            shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
+            shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
             static_cast<uint32_t>(stride.size()),
             stride.begin(),
             &tensor_description,
@@ -422,8 +434,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs) {
           ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i);
           return false;
         }
-        size_t e_size = get_format_size(
-            vgflib::ToVkFormat(resource_decoder->getVkFormat(i)));
+        size_t e_size = get_format_size(resource_format);
         if (0 == e_size) {
           ET_LOG(Error, "failed to get element size of VkFormat");
           return false;
@@ -449,9 +460,11 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs) {
             .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM,
             .pNext = nullptr,
             .tiling = VK_TENSOR_TILING_LINEAR_ARM,
-            .format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)),
-            .dimensionCount = static_cast<uint32_t>(shape.size()),
-            .pDimensions = shape.begin(),
+            .format = resource_format,
+            .dimensionCount =
+                shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
+            .pDimensions =
+                shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
             // Note: stride_data of 0's causes size==0, null means stride==size
             .pStrides = (0 == stride.size() ? nullptr : stride.begin()),
             .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM,
diff --git a/backends/arm/scripts/build_executor_runner_vkml.sh b/backends/arm/scripts/build_executor_runner_vkml.sh
index 1df63acc425..afca02c6299 100755
--- a/backends/arm/scripts/build_executor_runner_vkml.sh
+++ b/backends/arm/scripts/build_executor_runner_vkml.sh
@@ -69,6 +69,7 @@ cmake \
     -DCMAKE_BUILD_TYPE=${build_type}            \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
diff --git a/backends/arm/scripts/mlsdk_utils.sh b/backends/arm/scripts/mlsdk_utils.sh
index 7a7d2585e52..2c6553df3d3 100755
--- a/backends/arm/scripts/mlsdk_utils.sh
+++ b/backends/arm/scripts/mlsdk_utils.sh
@@ -38,6 +38,28 @@ function download_ai_mlsdk_manifest() {
                --manifest-url ${mlsdk_manifest_url} \
                --manifest-branch ${mlsdk_manifest_tag} \
                -g model-converter,emulation-layer,vgf-library
+
+# Update dependencies to use gitlab tosa-mlir-translator
+# Do not indent the xml. Heredoc indentation is significant.
+mkdir -p .repo/local_manifests/
+cat > ".repo/local_manifests/tosa_gitlab.xml" <<'XML'
+<manifest>
+  <remote name="gitlab" fetch="https://git.gitlab.arm.com/"/>
+
+  <!-- remove the mlplatform entry -->
+  <remove-project name="tosa/tosa_mlir_translator"/>
+
+  <!-- re-add with GitLab repo and pin the SHA -->
+  <project
+      name="tosa/tosa-mlir-translator"
+      path="dependencies/tosa_mlir_translator"
+      remote="gitlab"
+      revision="refs/tags/v2025.07.1"
+      groups="all model-converter"
+      sync-s="true"/>
+</manifest>
+XML
+
         ./repo sync -j$(nproc)
 
         popd
@@ -109,7 +131,7 @@ function setup_mlsdk() {
             -DSPIRV_TOOLS_PATH=../../dependencies/SPIRV-Tools        \
             -DVULKAN_HEADERS_PATH=../../dependencies/Vulkan-Headers
 
-        cmake --build build
+        cmake --build build -j$(nproc)
         cmake --install build --prefix deploy
         popd
     fi
diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py
index 2629d8eb257..54f8aa7421d 100644
--- a/backends/arm/scripts/parse_test_names.py
+++ b/backends/arm/scripts/parse_test_names.py
@@ -26,6 +26,8 @@
     "_native_batch_norm_legit_no_training.default",
     "_native_batch_norm_legit.no_stats",
     "alias_copy.default",
+    "pixel_shuffle.default",
+    "pixel_unshuffle.default",
 ]
 ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS
 
diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS
index ec35b63f8f6..fd7d894fbf0 100644
--- a/backends/arm/test/TARGETS
+++ b/backends/arm/test/TARGETS
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(":targets.bzl", "define_arm_tests")
 
@@ -58,6 +63,7 @@ runtime.python_library(
         "//executorch/backends/arm/quantizer:lib",
         "//executorch/backends/arm/tosa:mapping",
         "//executorch/backends/arm:vgf",
+        "//executorch/backends/arm:_factory",
         "//executorch/devtools/backend_debug:delegation_info",
         "//executorch/exir/backend:operator_support",
         "fbsource//third-party/pypi/tabulate:tabulate",
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index 963084d6091..3b5dd8bd4db 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -14,6 +14,7 @@
 
 import pytest
 from executorch.backends.arm.ethosu import EthosUCompileSpec
+
 from executorch.backends.arm.test.runner_utils import (
     arm_executor_runner_exists,
     corstone300_installed,
@@ -226,6 +227,7 @@ def parametrize(
     test_data: dict[str, Any],
     xfails: dict[str, xfail_type] | None = None,
     strict: bool = True,
+    flakies: dict[str, int] | None = None,
 ):
     """
     Custom version of pytest.mark.parametrize with some syntatic sugar and added xfail functionality
@@ -236,12 +238,17 @@ def parametrize(
     """
     if xfails is None:
         xfails = {}
+    if flakies is None:
+        flakies = {}
 
     def decorator_func(func):
         """Test data is transformed from a dict of (id, data) pairs to a list of pytest params to work with the native pytests parametrize function"""
         pytest_testsuite = []
         for id, test_parameters in test_data.items():
-            if id in xfails:
+            if id in flakies:
+                # Mark this parameter as flaky with given reruns
+                marker = (pytest.mark.flaky(reruns=flakies[id]),)
+            elif id in xfails:
                 xfail_info = xfails[id]
                 reason = ""
                 raises = None
diff --git a/backends/arm/test/misc/test_multiple_delegates.py b/backends/arm/test/misc/test_multiple_delegates.py
index f716bc45385..8dad25f4180 100644
--- a/backends/arm/test/misc/test_multiple_delegates.py
+++ b/backends/arm/test/misc/test_multiple_delegates.py
@@ -23,7 +23,7 @@ class MultipleDelegatesModule(torch.nn.Module):
 
     def forward(self, x: torch.Tensor, y: torch.Tensor):
         z = x + y
-        s = torch.tan(z)
+        s = torch.max(z)
         return s * z
 
 
diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
index 49266beee63..fad31b57537 100644
--- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-import unittest
+from typing import Tuple
 
 import torch
 from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
 from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
     CLIP_text_encoder_config,
 )
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
 from transformers import CLIPTextModelWithProjection
 
+input_t = Tuple[torch.Tensor]
+
 
-class TestCLIPTextModelWithProjection(unittest.TestCase):
+class TestCLIPTextModelWithProjection:
     """
     Test class of CLIPTextModelWithProjection.
     CLIPTextModelWithProjection is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -69,47 +75,93 @@ def prepare_model_and_inputs(self):
 
         return text_encoder_model, text_encoder_model_inputs
 
-    def test_CLIPTextModelWithProjection_tosa_FP(self):
-        text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    text_encoder_model,
-                    example_inputs=text_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
-                    transform_passes=[
-                        ConvertInt64ConstOpsToInt32Pass(),
-                        ConvertInt64OutputOpsToInt32Pass(),
-                        InsertInt32CastsAfterInt64PlaceholdersPass(),
-                    ],
-                )
-                .export()
-                .to_edge_transform_and_lower()
-                .dump_operator_distribution()
-                .check_count(self.ops_after_partitioner_FP)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=text_encoder_model_inputs,
-                )
-            )
-
-    def test_CLIPTextModelWithProjection_tosa_INT(self):
-        text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    text_encoder_model,
-                    example_inputs=text_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
-                )
-                .quantize()
-                .export()
-                .to_edge_transform_and_lower()
-                .dump_operator_distribution()
-                .check_count(self.ops_after_partitioner_INT)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=text_encoder_model_inputs,
-                    atol=0.8,
-                )
-            )
+
+def test_CLIPTextModelWithProjection_tosa_FP():
+    text_encoder_model, text_encoder_model_inputs = (
+        TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineFP[input_t](
+            text_encoder_model,
+            text_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            transform_passes=[
+                ConvertInt64ConstOpsToInt32Pass(),
+                ConvertInt64OutputOpsToInt32Pass(),
+                InsertInt32CastsAfterInt64PlaceholdersPass(),
+            ],
+        )
+        pipeline.change_args(
+            "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+def test_CLIPTextModelWithProjection_tosa_INT():
+    text_encoder_model, text_encoder_model_inputs = (
+        TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineINT[input_t](
+            text_encoder_model,
+            text_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            atol=0.8,
+        )
+        pipeline.change_args(
+            "check_count.exir",
+            TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_FP():
+    text_encoder_model, text_encoder_model_inputs = (
+        TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            text_encoder_model,
+            text_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+FP",
+            use_to_edge_transform_and_lower=True,
+            atol=4,  # TODO: Investiage numerical issue: MAX Diff ~50%
+            transform_passes=[
+                ConvertInt64ConstOpsToInt32Pass(),
+                ConvertInt64OutputOpsToInt32Pass(),
+                InsertInt32CastsAfterInt64PlaceholdersPass(),
+            ],
+        )
+        pipeline.change_args(
+            "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_CLIPTextModelWithProjection_vgf_INT():
+    text_encoder_model, text_encoder_model_inputs = (
+        TestCLIPTextModelWithProjection().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            text_encoder_model,
+            text_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+INT",
+            use_to_edge_transform_and_lower=True,
+            atol=0.8,
+        )
+        pipeline.change_args(
+            "check_count.exir",
+            TestCLIPTextModelWithProjection.ops_after_partitioner_INT,
+        )
+        pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index f9d814d044b..9506fe727db 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-import unittest
+from typing import Tuple
 
 import torch
 from diffusers.models.transformers import SD3Transformer2DModel
@@ -13,10 +13,16 @@
 from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
     SD3Transformer2DModel_init_dict,
 )
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+input_t4 = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
 
 
-class TestSD3Transformer2DModel(unittest.TestCase):
+class TestSD3Transformer2DModel:
     """
     Test class of AutoenSD3Transformer2DModelcoderKL.
     SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium
@@ -24,16 +30,12 @@ class TestSD3Transformer2DModel(unittest.TestCase):
 
     # Adjust nbr below as we increase op support.
     ops_after_partitioner_FP = {
-        "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
         "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1,
-        "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
         "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1,
         "torch.ops.higher_order.executorch_call_delegate": 1,
     }
 
     ops_after_partitioner_INT = {
-        "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
-        "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
         "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
         "torch.ops.higher_order.executorch_call_delegate": 2,
     }
@@ -93,48 +95,88 @@ def forward(self, *args, **kwargs):
 
         return sd35_transformer2D_model, sd35_transformer2D_model_inputs
 
-    def test_SD3Transformer2DModel_tosa_FP(self):
-        sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
-            self.prepare_model_and_inputs()
-        )
-        with torch.no_grad():
-            (
-                ArmTester(
-                    sd35_transformer2D_model,
-                    example_inputs=sd35_transformer2D_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
-                )
-                .export()
-                .to_edge_transform_and_lower()
-                .check_count(self.ops_after_partitioner_FP)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=sd35_transformer2D_model_inputs,
-                    rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
-                    atol=4.0,
-                )
-            )
 
-    def test_SD3Transformer2DModel_tosa_INT(self):
-        sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
-            self.prepare_model_and_inputs()
+def test_SD3Transformer2DModel_tosa_FP():
+    sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+        TestSD3Transformer2DModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineFP[input_t4](
+            sd35_transformer2D_model,
+            sd35_transformer2D_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+            atol=4.0,
         )
-        with torch.no_grad():
-            (
-                ArmTester(
-                    sd35_transformer2D_model,
-                    example_inputs=sd35_transformer2D_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
-                )
-                .quantize()
-                .export()
-                .to_edge_transform_and_lower()
-                .check_count(self.ops_after_partitioner_INT)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=sd35_transformer2D_model_inputs,
-                    qtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
-                    rtol=1.0,
-                    atol=4.0,
-                )
-            )
+        pipeline.change_args(
+            "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+def test_SD3Transformer2DModel_tosa_INT():
+    sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+        TestSD3Transformer2DModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineINT[input_t4](
+            sd35_transformer2D_model,
+            sd35_transformer2D_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            qtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+            rtol=1.0,
+            atol=4.0,
+        )
+        pipeline.change_args(
+            "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_FP():
+    sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+        TestSD3Transformer2DModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t4](
+            sd35_transformer2D_model,
+            sd35_transformer2D_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+FP",
+            use_to_edge_transform_and_lower=True,
+            rtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+            atol=4.0,
+        )
+        pipeline.change_args(
+            "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_SD3Transformer2DModel_vgf_INT():
+    sd35_transformer2D_model, sd35_transformer2D_model_inputs = (
+        TestSD3Transformer2DModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t4](
+            sd35_transformer2D_model,
+            sd35_transformer2D_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+INT",
+            use_to_edge_transform_and_lower=True,
+            qtol=1.0,  # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT
+            rtol=1.0,
+            atol=4.0,
+        )
+        pipeline.change_args(
+            "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT
+        )
+        pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
index 22a47042eb1..20b92e4a258 100644
--- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-import unittest
+from typing import Tuple
 
 import torch
 from executorch.backends.arm._passes import (
@@ -17,11 +17,17 @@
 from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
     T5_encoder_config,
 )
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
 from transformers import T5EncoderModel
 
+input_t = Tuple[torch.Tensor]
+
 
-class TestT5EncoderModel(unittest.TestCase):
+class TestT5EncoderModel:
     """
     Test class of T5EncoderModel.
     T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium
@@ -61,46 +67,88 @@ def prepare_model_and_inputs(self):
 
         return t5_encoder_model, t5_encoder_model_inputs
 
-    def test_T5EncoderModel_tosa_FP(self):
-        t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    t5_encoder_model,
-                    example_inputs=t5_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
-                    transform_passes=[
-                        ConvertInt64ConstOpsToInt32Pass(),
-                        ConvertInt64OutputOpsToInt32Pass(),
-                        InsertInt32CastsAfterInt64PlaceholdersPass(),
-                    ],
-                )
-                .export()
-                .to_edge_transform_and_lower()
-                .dump_operator_distribution()
-                .check_count(self.ops_after_partitioner_FP)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=t5_encoder_model_inputs,
-                )
-            )
-
-    def test_T5EncoderModel_tosa_INT(self):
-        t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    t5_encoder_model,
-                    example_inputs=t5_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
-                )
-                .quantize()
-                .export()
-                .to_edge_transform_and_lower()
-                .dump_operator_distribution()
-                .check_count(self.ops_after_partitioner_INT)
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=t5_encoder_model_inputs,
-                )
-            )
+
+def test_T5EncoderModel_tosa_FP():
+    t5_encoder_model, t5_encoder_model_inputs = (
+        TestT5EncoderModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineFP[input_t](
+            t5_encoder_model,
+            t5_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            transform_passes=[
+                ConvertInt64ConstOpsToInt32Pass(),
+                ConvertInt64OutputOpsToInt32Pass(),
+                InsertInt32CastsAfterInt64PlaceholdersPass(),
+            ],
+        )
+        pipeline.change_args(
+            "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+def test_T5EncoderModel_tosa_INT():
+    t5_encoder_model, t5_encoder_model_inputs = (
+        TestT5EncoderModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineINT[input_t](
+            t5_encoder_model,
+            t5_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+        )
+        pipeline.change_args(
+            "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_FP():
+    t5_encoder_model, t5_encoder_model_inputs = (
+        TestT5EncoderModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            t5_encoder_model,
+            t5_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+FP",
+            use_to_edge_transform_and_lower=True,
+            transform_passes=[
+                ConvertInt64ConstOpsToInt32Pass(),
+                ConvertInt64OutputOpsToInt32Pass(),
+                InsertInt32CastsAfterInt64PlaceholdersPass(),
+            ],
+        )
+        pipeline.change_args(
+            "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_T5EncoderModel_vgf_INT():
+    t5_encoder_model, t5_encoder_model_inputs = (
+        TestT5EncoderModel().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            t5_encoder_model,
+            t5_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+INT",
+            use_to_edge_transform_and_lower=True,
+        )
+        pipeline.change_args(
+            "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT
+        )
+        pipeline.run()
diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
index ab0f4892fb8..a3c3a018131 100644
--- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
+++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 
-import unittest
+from typing import Tuple
 
 import torch
 from diffusers.models.autoencoders import AutoencoderKL
@@ -14,10 +14,16 @@
 from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import (
     AutoencoderKL_config,
 )
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+input_t = Tuple[torch.Tensor]
 
 
-class TestAutoencoderKL(unittest.TestCase):
+class TestAutoencoderKL:
     """
     Test class of AutoencoderKL.
     AutoencoderKL is the encoder/decoder used by Stable Diffusion 3.5 Medium
@@ -41,40 +47,68 @@ def forward(self, *args, **kwargs):
 
         return auto_encoder_model, auto_encoder_model_inputs
 
-    def test_AutoencoderKL_tosa_FP(self):
-        auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    auto_encoder_model,
-                    example_inputs=auto_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
-                )
-                .export()
-                .to_edge_transform_and_lower()
-                .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=auto_encoder_model_inputs,
-                )
-            )
-
-    def test_AutoencoderKL_tosa_INT(self):
-        auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs()
-        with torch.no_grad():
-            (
-                ArmTester(
-                    auto_encoder_model,
-                    example_inputs=auto_encoder_model_inputs,
-                    compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
-                )
-                .quantize()
-                .export()
-                .to_edge_transform_and_lower()
-                .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
-                .to_executorch()
-                .run_method_and_compare_outputs(
-                    inputs=auto_encoder_model_inputs,
-                    atol=1.0,  # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
-                )
-            )
+
+def test_AutoencoderKL_tosa_FP():
+    auto_encoder_model, auto_encoder_model_inputs = (
+        TestAutoencoderKL().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineFP[input_t](
+            auto_encoder_model,
+            auto_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+        )
+        pipeline.run()
+
+
+def test_AutoencoderKL_tosa_INT():
+    auto_encoder_model, auto_encoder_model_inputs = (
+        TestAutoencoderKL().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = TosaPipelineINT[input_t](
+            auto_encoder_model,
+            auto_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            use_to_edge_transform_and_lower=True,
+            atol=1.0,  # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_FP():
+    auto_encoder_model, auto_encoder_model_inputs = (
+        TestAutoencoderKL().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            auto_encoder_model,
+            auto_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+FP",
+            use_to_edge_transform_and_lower=True,
+        )
+        pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+def test_AutoencoderKL_vgf_INT():
+    auto_encoder_model, auto_encoder_model_inputs = (
+        TestAutoencoderKL().prepare_model_and_inputs()
+    )
+    with torch.no_grad():
+        pipeline = VgfPipeline[input_t](
+            auto_encoder_model,
+            auto_encoder_model_inputs,
+            aten_op=[],
+            exir_op=[],
+            tosa_version="TOSA-1.0+INT",
+            use_to_edge_transform_and_lower=True,
+            atol=1.0,  # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT
+        )
+        pipeline.run()
diff --git a/backends/arm/test/models/test_resnet18.py b/backends/arm/test/models/test_resnet18.py
index 1c1011ec967..3cb21abd772 100644
--- a/backends/arm/test/models/test_resnet18.py
+++ b/backends/arm/test/models/test_resnet18.py
@@ -23,7 +23,8 @@
 model = model.eval()
 normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 
-model_inputs = (normalize(torch.randn((1, 3, 224, 224))),)
+# Using torch.rand * 2 - 1 to generate numbers in the range [-1;1] like an RGB image
+model_inputs = (normalize(torch.rand((1, 3, 224, 224)) * 2 - 1),)
 
 input_t = Tuple[torch.Tensor]
 
@@ -71,7 +72,7 @@ def test_resnet_u55_INT(per_channel_quantization):
         exir_ops=[],
         use_to_edge_transform_and_lower=True,
         per_channel_quantization=per_channel_quantization,
-        atol=0.5,
+        atol=0.25,
         qtol=1,
     )
     pipeline.run()
@@ -91,7 +92,7 @@ def test_resnet_u85_INT(per_channel_quantization):
         exir_ops=[],
         use_to_edge_transform_and_lower=True,
         per_channel_quantization=per_channel_quantization,
-        atol=0.5,
+        atol=0.25,
         qtol=1,
     )
     pipeline.run()
diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py
index 9b3f98763c6..09c9d8fa224 100644
--- a/backends/arm/test/ops/test_add.py
+++ b/backends/arm/test/ops/test_add.py
@@ -7,7 +7,6 @@
 
 from typing import cast, Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer import arm_quantizer
 from executorch.backends.arm.quantizer.arm_quantizer import (
@@ -78,7 +77,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 
 class Add3(torch.nn.Module):
     def forward(self, x: torch.Tensor, y: torch.Tensor):
-        return x + y
+        return torch.add(x, y, alpha=1.5)
 
     test_data: list[input_t2] = {
         "3d_randn_diff_rank": lambda: (torch.randn(1, 4, 5), torch.randn(4, 1)),
@@ -260,9 +259,6 @@ def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", Add.test_data)
-@pytest.mark.xfail(
-    reason="missing int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13730"
-)
 def test_add_tensor_16a8w_tosa_INT(test_data: input_t1):
     """Test add operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py
index b9a891ec740..685b69b3541 100644
--- a/backends/arm/test/ops/test_addmm.py
+++ b/backends/arm/test/ops/test_addmm.py
@@ -167,7 +167,6 @@ def test_addmm_u85_INT(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_addmm_vgf_FP(test_data: input_t1):
     pipeline = VgfPipeline[input_t1](
         Addmm(),
@@ -181,7 +180,6 @@ def test_addmm_vgf_FP(test_data: input_t1):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_addmm_vgf_INT(test_data: input_t1):
     pipeline = VgfPipeline[input_t1](
         Addmm(),
@@ -213,9 +211,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979"
-)
 def test_addmm_16a8w_tosa_INT(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -268,9 +263,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations"
-)
 def test_addmm_16a8w_u85_INT16(test_data: input_t1):
     """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py
index 99529e07ca2..e69e9163325 100644
--- a/backends/arm/test/ops/test_amax.py
+++ b/backends/arm/test/ops/test_amax.py
@@ -139,7 +139,6 @@ def test_max_dim_tosa_FP_not_delegated():
 
 @common.parametrize("test_data", Amax.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_amax_vgf_FP(test_data: Amax.input_t):
     data, dim, keep_dims = test_data()
     module = Amax(dim, keep_dims)
@@ -154,7 +153,6 @@ def test_amax_vgf_FP(test_data: Amax.input_t):
 
 @common.parametrize("test_data", Amax.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_amax_vgf_INT(test_data: Amax.input_t):
     data, dim, keep_dims = test_data()
     module = Amax(dim, keep_dims)
@@ -169,7 +167,6 @@ def test_amax_vgf_INT(test_data: Amax.input_t):
 
 @common.parametrize("test_data", Max.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_max_dim_vgf_FP_to_amax(test_data: Max.input_t):
     data, dim = test_data()
     pipeline = VgfPipeline[Max.input_t](
@@ -183,7 +180,6 @@ def test_max_dim_vgf_FP_to_amax(test_data: Max.input_t):
 
 @common.parametrize("test_data", Max.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_max_dim_vgf_INT_to_amax(test_data: Max.input_t):
     data, dim = test_data()
     pipeline = VgfPipeline[Max.input_t](
diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py
index 1526ed21b89..09d9018c73e 100644
--- a/backends/arm/test/ops/test_amin.py
+++ b/backends/arm/test/ops/test_amin.py
@@ -29,12 +29,16 @@ def __init__(self, dim, keep_dims):
         super().__init__()
 
     def forward(self, x):
-        return torch.amin(x, self.dim, self.keep_dims)
+        if self.dim is None:
+            return torch.amin(x, keepdim=self.keep_dims)
+        else:
+            return torch.amin(x, self.dim, self.keep_dims)
 
-    test_data: Dict[str, input_t] = {
+    test_data: Dict = {
         "rank_1_dim_0": lambda: ((torch.rand([10]),), 0, False),
         "rank_2_dim_1_keep_dims": lambda: ((torch.rand([2, 2]),), (1,), True),
         "rank_4_all_dim": lambda: ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False),
+        "rank_4_no_dim": lambda: ((torch.rand([1, 2, 5, 5]),), None, False),
         "rank_4_0,3_keep_dims": lambda: ((torch.rand([1, 2, 2, 2]),), (0, 3), True),
         "rank_4_mult_batches": lambda: ((torch.rand([2, 2, 2, 2]),), (0), True),
     }
@@ -52,7 +56,7 @@ def forward(self, x):
         x = torch.min(x, self.dim)
         return x[0]
 
-    test_data: Dict[str, input_t] = {
+    test_data: Dict = {
         "rank_1_dim_0": lambda: ((torch.rand([10]),), 0),
         "rank_2_dim_1": lambda: ((torch.rand([2, 2]),), 1),
         "rank_4_dim_2": lambda: ((torch.rand([2, 2, 2, 2]),), 2),
@@ -151,7 +155,6 @@ def test_min_dim_tosa_FP_not_delegated():
 
 @common.parametrize("test_data", Amin.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_amin_vgf_FP(test_data: Amin.input_t):
     data, dim, keep_dims = test_data()
     pipeline = VgfPipeline[Amin.input_t](
@@ -162,7 +165,6 @@ def test_amin_vgf_FP(test_data: Amin.input_t):
 
 @common.parametrize("test_data", Amin.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_amin_vgf_INT(test_data: Amin.input_t):
     data, dim, keep_dims = test_data()
     pipeline = VgfPipeline[Amin.input_t](
@@ -176,7 +178,6 @@ def test_amin_vgf_INT(test_data: Amin.input_t):
 
 @common.parametrize("test_data", Min.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_min_dim_vgf_FP_to_amin(test_data: Min.input_t):
     data, dim = test_data()
     pipeline = VgfPipeline[Min.input_t](
@@ -190,7 +191,6 @@ def test_min_dim_vgf_FP_to_amin(test_data: Min.input_t):
 
 @common.parametrize("test_data", Min.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_min_dim_vgf_INT_to_amin(test_data: Min.input_t):
     data, dim = test_data()
     pipeline = VgfPipeline[Min.input_t](
diff --git a/backends/arm/test/ops/test_any.py b/backends/arm/test/ops/test_any.py
index 1676018f0ce..3eccff0a64e 100644
--- a/backends/arm/test/ops/test_any.py
+++ b/backends/arm/test/ops/test_any.py
@@ -6,7 +6,6 @@
 
 from typing import List, Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -189,7 +188,6 @@ def test_any_u85_INT(test_data: input_t1):
 
 @common.parametrize("test_data", test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_any_vgf_FP(test_data: input_t1):
     op, data_fn = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -204,7 +202,6 @@ def test_any_vgf_FP(test_data: input_t1):
 
 @common.parametrize("test_data", test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_any_vgf_INT(test_data: input_t1):
     op, data_fn = test_data()
     pipeline = VgfPipeline[input_t1](
diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index f18d4c997a5..f69b1419c8d 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -146,7 +146,11 @@ def test_bmm_vgf_FP(test_data: input_t1):
     pipeline.run()
 
 
-@common.parametrize("test_data", BMMSingleInput.test_data_generators)
+@common.parametrize(
+    "test_data",
+    BMMSingleInput.test_data_generators,
+    flakies={"rand_big_1": 3},
+)
 @common.SkipIfNoModelConverter
 def test_bmm_vgf_FP_single_input(test_data: input_t1):
     pipeline = VgfPipeline[input_t1](
diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py
index a410240d310..254edbc411f 100644
--- a/backends/arm/test/ops/test_cat.py
+++ b/backends/arm/test/ops/test_cat.py
@@ -8,7 +8,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -178,9 +177,6 @@ def get_symmetric_a16w8_cat_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", Cat.test_parameters)
-@pytest.mark.xfail(
-    reason="missing int16 cat ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13978"
-)
 def test_cat_16a8w_tosa_INT(test_data: Tuple):
     """Test cat operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -206,9 +202,6 @@ def test_cat_16a8w_tosa_INT(test_data: Tuple):
 
 @common.parametrize("test_data", Cat.test_parameters)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations"
-)
 def test_cat_16a8w_u55_INT16(test_data: Tuple):
     """Test cat operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -233,9 +226,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple):
 
 @common.parametrize("test_data", Cat.test_parameters)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations"
-)
 def test_cat_16a8w_u85_INT16(test_data: Tuple):
     """Test cat operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index bd719954ff5..4029fcef54e 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -8,8 +8,6 @@
 
 from typing import Tuple
 
-import pytest
-
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -313,12 +311,8 @@ def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
     pipeline.run()
 
 
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
-    strict=False,
-)
 def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
     """Test linear operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     test_data, out_features, has_bias, per_channel_quantization = test_data()
@@ -347,12 +341,8 @@ def test_linear_16a8w_u55_INT16(test_data: torch.Tensor):
     pipeline.run()
 
 
-@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@common.parametrize("test_data", test_data_all_16a8w)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Ethos-U55 A16W8 linear: int16 matmul not yet supported; pending backend support or linear->conv1x1 lowering. See: https://github.com/pytorch/executorch/issues/13947",
-    strict=False,
-)
 def test_linear_16a8w_u85_INT16(test_data: torch.Tensor):
     """Test linear operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     test_data, out_features, has_bias, per_channel_quantization = test_data()
diff --git a/backends/arm/test/ops/test_matmul.py b/backends/arm/test/ops/test_matmul.py
index a788fc00a5d..f564672e98f 100644
--- a/backends/arm/test/ops/test_matmul.py
+++ b/backends/arm/test/ops/test_matmul.py
@@ -22,6 +22,7 @@
 
 class MatMul(torch.nn.Module):
     test_data_generators = {
+        "rand_rand_2d": lambda: (torch.rand(5, 5), torch.rand(5, 2)),
         "rand_rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
         "rand_rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
     }
@@ -32,6 +33,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 
 class MatMulSingleInput(torch.nn.Module):
     test_data_generators = {
+        "rand_2d": lambda: (torch.rand(5, 5),),
         "rand_3d": lambda: (torch.rand(2, 5, 5),),
         "rand_4d": lambda: (torch.rand(1, 2, 5, 5),),
     }
@@ -42,6 +44,11 @@ def forward(self, x: torch.Tensor):
 
 class MatMulCombo(torch.nn.Module):
     test_data_generators = {
+        "rand_rand_rand_2d": lambda: (
+            torch.rand(5, 5),
+            torch.rand(5, 2),
+            torch.rand(2, 5),
+        ),
         "rand_rand_rand_3d": lambda: (
             torch.rand(2, 5, 5),
             torch.rand(2, 5, 2),
diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py
index 96ec7793551..970340c352b 100644
--- a/backends/arm/test/ops/test_mean_dim.py
+++ b/backends/arm/test/ops/test_mean_dim.py
@@ -4,7 +4,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -84,7 +83,6 @@ def test_adaptive_avg_pool2d_u85_INT(test_data):
 
 @common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_adaptive_avg_pool2d_vgf_FP(test_data):
     pipeline = VgfPipeline[input_t](
         AdaptiveAveragePool2d(),
@@ -98,7 +96,6 @@ def test_adaptive_avg_pool2d_vgf_FP(test_data):
 
 @common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_adaptive_avg_pool2d_vgf_INT(test_data):
     pipeline = VgfPipeline[input_t](
         AdaptiveAveragePool2d(),
@@ -115,7 +112,7 @@ class MeanDim(torch.nn.Module):
     test_data_suite: dict[str, tuple] = {
         "rank_1_keepdim": lambda: (
             torch.rand(7),
-            (0),
+            0,
             True,
         ),
         "rank_2_keepdim": lambda: (
@@ -168,6 +165,11 @@ class MeanDim(torch.nn.Module):
             (0, 1, 2, 3),
             True,
         ),
+        "rand_none_keepdim": lambda: (
+            torch.rand(1, 5, 7, 3),
+            None,
+            True,
+        ),
         "rank_1": lambda: (
             torch.rand(7),
             (-1),
@@ -280,7 +282,6 @@ def test_mean_dim_tosa_INT(test_data):
         (test_data,),
         [],  # Might be sum, avgpool, or both
         symmetric_io_quantization=True,
-        custom_path="MEANDIM",
     )
     pipeline.run()
 
@@ -327,7 +328,6 @@ def test_mean_dim_u85_INT(test_data):
 
 @common.parametrize("test_data", MeanDim.test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_mean_dim_vgf_FP(test_data):
     test_data_val, dim, keep_dim = test_data()
     pipeline = VgfPipeline[input_t](
@@ -342,7 +342,6 @@ def test_mean_dim_vgf_FP(test_data):
 
 @common.parametrize("test_data", MeanDim.test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_mean_dim_vgf_INT(test_data):
     test_data_val, dim, keep_dim = test_data()
     pipeline = VgfPipeline[input_t](
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index e3f2096e7da..02447e40c4e 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -8,7 +8,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -310,9 +309,6 @@ def get_symmetric_a16w8_mul_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 mul ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13947"
-)
 def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
     """Test mul operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -338,9 +334,6 @@ def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
 def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
     """Test mul operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -365,9 +358,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947"
-)
 def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1):
     """Test mul operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/ops/test_multihead_attention.py b/backends/arm/test/ops/test_multihead_attention.py
index c7998e2235e..cbc2ccb32f4 100644
--- a/backends/arm/test/ops/test_multihead_attention.py
+++ b/backends/arm/test/ops/test_multihead_attention.py
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -69,7 +68,6 @@ def test_multihead_attention_tosa_INT(test_data):
     "test_data",
     test_suite,
 )
-@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP")
 @common.XfailIfNoCorstone300
 def test_multihead_attention_u55_INT(test_data: input_t1):
     test_data, module = test_data()
@@ -90,7 +88,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1):
     "test_data",
     test_suite,
 )
-@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP")
 @common.XfailIfNoCorstone320
 def test_multihead_attention_u85_INT(test_data: input_t1):
     test_data, module = test_data()
diff --git a/backends/arm/test/ops/test_pixel_shuffling.py b/backends/arm/test/ops/test_pixel_shuffling.py
new file mode 100644
index 00000000000..5aeb8b2d1bb
--- /dev/null
+++ b/backends/arm/test/ops/test_pixel_shuffling.py
@@ -0,0 +1,233 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Tuple
+
+import pytest
+
+import torch
+
+from executorch.backends.arm.constants import MAX_RANK
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+from torch import nn
+
+aten_op_pixel_unshuffle = "torch.ops.aten.pixel_unshuffle.default"
+exir_op_pixel_unshuffle = (
+    "executorch_exir_dialects_edge__ops_aten_pixel_unshuffle_default"
+)
+
+aten_op_pixel_shuffle = "torch.ops.aten.pixel_shuffle.default"
+exir_op_pixel_shuffle = "executorch_exir_dialects_edge__ops_aten_pixel_shuffle_default"
+
+input_t1 = Tuple[torch.Tensor]  # single positional input (1-tuple)
+
+max_rank_input_supported = MAX_RANK - 2
+
+
+class PixelUnShuffle(nn.Module):
+
+    upscale_factor = 2
+    test_data_generators = {
+        "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+        "test_4d": lambda: (torch.tensor([[[[10.0, 20.0], [30.0, 40.0]]]]),),
+        "test_3d": lambda: (torch.tensor([[[10.0, 20.0], [30.0, 40.0]]]),),
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.space_to_depth = nn.PixelUnshuffle(self.upscale_factor)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        if inputs.dim() > max_rank_input_supported:
+            raise RuntimeError(
+                f"Max rank of input for pixel_unshuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+            )
+        return self.space_to_depth(inputs)
+
+
+class PixelShuffle(nn.Module):
+
+    upscale_factor = 2
+    test_data_generators = {
+        "rand_4d": lambda: (torch.randn(1, 12, 64, 64),),
+        "test_4d": lambda: (torch.tensor([[[[10.0]], [[20.0]], [[30.0]], [[40.0]]]]),),
+        "test_3d": lambda: (torch.tensor([[[10.0]], [[20.0]], [[30.0]], [[40.0]]]),),
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.depth_to_space = nn.PixelShuffle(self.upscale_factor)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        if inputs.dim() > max_rank_input_supported:
+            raise RuntimeError(
+                f"Max rank of input for pixel_shuffle is currently {max_rank_input_supported}, got {inputs.dim()}"
+            )
+        return self.depth_to_space(inputs)
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_FP(test_data: input_t1):
+    pipeline = TosaPipelineFP[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+def test_pixel_unshuffle_tosa_INT(test_data: input_t1):
+    pipeline = TosaPipelineINT[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_FP(test_data: input_t1):
+    pipeline = TosaPipelineFP[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+def test_pixel_shuffle_tosa_INT(test_data: input_t1):
+    pipeline = TosaPipelineINT[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_FP(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+        tosa_version="TOSA-1.0+FP",
+        run_on_vulkan_runtime=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_unshuffle_vgf_INT(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+        tosa_version="TOSA-1.0+INT",
+        run_on_vulkan_runtime=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_FP(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+        tosa_version="TOSA-1.0+FP",
+        run_on_vulkan_runtime=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.SkipIfNoModelConverter
+def test_pixel_shuffle_vgf_INT(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+        tosa_version="TOSA-1.0+INT",
+        run_on_vulkan_runtime=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_unshuffle_u55_INT(test_data: input_t1):
+    pipeline = EthosU55PipelineINT[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelUnShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_unshuffle_u85_INT(test_data: input_t1):
+    pipeline = EthosU85PipelineINT[input_t1](
+        PixelUnShuffle(),
+        test_data(),
+        aten_op_pixel_unshuffle,
+        exir_op_pixel_unshuffle,
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone300
+def test_pixel_shuffle_u55_INT(test_data: input_t1):
+    pipeline = EthosU55PipelineINT[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+        run_on_fvp=True,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", PixelShuffle.test_data_generators)
+@common.XfailIfNoCorstone320
+@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails")
+def test_pixel_shuffle_u85_INT(test_data: input_t1):
+    pipeline = EthosU85PipelineINT[input_t1](
+        PixelShuffle(),
+        test_data(),
+        aten_op_pixel_shuffle,
+        exir_op_pixel_shuffle,
+        run_on_fvp=True,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_scalar_tensor.py b/backends/arm/test/ops/test_scalar_tensor.py
index ecc2fece223..d5e5b365da1 100644
--- a/backends/arm/test/ops/test_scalar_tensor.py
+++ b/backends/arm/test/ops/test_scalar_tensor.py
@@ -2,7 +2,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-import pytest
 import torch
 from executorch.backends.arm.test import common
 
@@ -102,7 +101,6 @@ def test_scalar_tensor_u85_INT(test_data):
 
 @common.parametrize("test_data", float_test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_scalar_tensor_vgf_FP(test_data):
     scalar, dtype, data = test_data()
     pipeline = VgfPipeline(
@@ -116,7 +114,6 @@ def test_scalar_tensor_vgf_FP(test_data):
 
 @common.parametrize("test_data", int_test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_scalar_tensor_vgf_INT(test_data):
     scalar, dtype, data = test_data()
     pipeline = VgfPipeline(
diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py
index b47295f967b..23046c34fe4 100644
--- a/backends/arm/test/ops/test_select.py
+++ b/backends/arm/test/ops/test_select.py
@@ -7,7 +7,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 
 from executorch.backends.arm.test import common
@@ -170,7 +169,6 @@ def test_select_int_u85_INT(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_select_int_vgf_FP_copy(test_data: Tuple):
     pipeline = VgfPipeline[input_t1](
         SelectCopy(), test_data(), aten_op_copy, [], tosa_version="TOSA-1.0+FP"
@@ -180,7 +178,6 @@ def test_select_int_vgf_FP_copy(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_select_int_vgf_FP(test_data: Tuple):
     pipeline = VgfPipeline[input_t1](
         SelectInt(), test_data(), aten_op_int, [], tosa_version="TOSA-1.0+FP"
@@ -190,7 +187,6 @@ def test_select_int_vgf_FP(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_select_int_vgf_INT_copy(test_data: Tuple):
     pipeline = VgfPipeline[input_t1](
         SelectCopy(),
@@ -204,7 +200,6 @@ def test_select_int_vgf_INT_copy(test_data: Tuple):
 
 @common.parametrize("test_data", test_data_suite)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_select_int_vgf_INT(test_data: Tuple):
     pipeline = VgfPipeline[input_t1](
         SelectInt(),
diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py
index 25117ef89de..362358d0813 100644
--- a/backends/arm/test/ops/test_silu.py
+++ b/backends/arm/test/ops/test_silu.py
@@ -8,7 +8,6 @@
 
 from typing import Optional, Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -125,7 +124,6 @@ def test_silu_u85_INT_inplace(test_data: input_t):
 
 @common.parametrize("test_data", Silu.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
 def test_silu_vgf_FP(test_data: input_t):
     silu_data = (test_data(), False)
     pipeline = VgfPipeline[input_t](
@@ -136,7 +134,6 @@ def test_silu_vgf_FP(test_data: input_t):
 
 @common.parametrize("test_data", Silu.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
 def test_silu_vgf_FP_inplace(test_data: input_t):
     silu_data = (test_data(), True)
     pipeline = VgfPipeline[input_t](
@@ -147,7 +144,6 @@ def test_silu_vgf_FP_inplace(test_data: input_t):
 
 @common.parametrize("test_data", Silu.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
 def test_silu_vgf_INT(test_data: input_t):
     silu_data = (test_data(), False)
     pipeline = VgfPipeline[input_t](
@@ -161,7 +157,6 @@ def test_silu_vgf_INT(test_data: input_t):
 
 @common.parametrize("test_data", Silu.test_data)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1387: Output differs")
 def test_silu_vgf_INT_inplace(test_data: input_t):
     silu_data = (test_data(), True)
     pipeline = VgfPipeline[input_t](
diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py
index b4bbf60d1bd..7e71a51899a 100644
--- a/backends/arm/test/ops/test_slice.py
+++ b/backends/arm/test/ops/test_slice.py
@@ -7,7 +7,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -154,9 +153,6 @@ def get_symmetric_a16w8_slice_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(
-    reason="missing int16 slice ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13976"
-)
 def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -182,9 +178,6 @@ def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations"
-)
 def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -209,9 +202,6 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor):
 
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations"
-)
 def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor):
     """Test slice operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/ops/test_stack.py b/backends/arm/test/ops/test_stack.py
new file mode 100644
index 00000000000..873a599992a
--- /dev/null
+++ b/backends/arm/test/ops/test_stack.py
@@ -0,0 +1,150 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+test_data_suite = {
+    # (test_name, test_data)
+    "ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0),
+    "ones_and_rand_three_tensors": lambda: (
+        (torch.ones(1, 2), torch.randn(1, 2), torch.randn(1, 2)),
+        1,
+    ),
+    "ones_and_rand_four_tensors": lambda: (
+        (
+            torch.ones(1, 2, 5),
+            torch.randn(1, 2, 5),
+            torch.randn(1, 2, 5),
+            torch.randn(1, 2, 5),
+        ),
+        -1,
+    ),
+    "rand_two_tensors": lambda: (
+        (torch.randn(2, 2, 4), torch.randn(2, 2, 4)),
+        2,
+    ),
+    "rand_two_tensors_dim_0": lambda: (
+        (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)),
+    ),
+    "rand_two_tensors_dim_2": lambda: (
+        (torch.randn(2, 2, 3, 5), torch.randn(2, 2, 3, 5)),
+        2,
+    ),
+    "rand_large": lambda: (
+        (
+            10000 * torch.randn(2, 3, 1, 4),
+            torch.randn(2, 3, 1, 4),
+            torch.randn(2, 3, 1, 4),
+        ),
+        -3,
+    ),
+}
+
+
+class Stack(nn.Module):
+    aten_op = "torch.ops.aten.stack.default"
+    exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default"
+
+    def forward(self, n: tuple[torch.Tensor, ...], dim: int = 0):
+        return torch.stack(n, dim)
+
+
+input_t1 = Tuple[torch.Tensor]
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_FP(test_module: input_t1):
+    test_data = test_module()
+    pipeline = TosaPipelineFP[input_t1](
+        Stack(),
+        test_data,
+        aten_op=Stack.aten_op,
+        exir_op=Stack.exir_op,
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_module", test_data_suite)
+def test_stack_tosa_INT(test_module: input_t1):
+    test_data = test_module()
+    pipeline = TosaPipelineINT[input_t1](
+        Stack(),
+        test_data,
+        aten_op=Stack.aten_op,
+        exir_op=Stack.exir_op,
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone300
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u55_INT(test_module: input_t1):
+    test_data = test_module()
+    pipeline = EthosU55PipelineINT[input_t1](
+        Stack(),
+        test_data,
+        aten_ops=Stack.aten_op,
+        exir_ops=Stack.exir_op,
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
+
+
+@common.XfailIfNoCorstone320
+@common.parametrize("test_module", test_data_suite)
+def test_stack_u85_INT(test_module: input_t1):
+    test_data = test_module()
+    pipeline = EthosU85PipelineINT[input_t1](
+        Stack(),
+        test_data,
+        aten_ops=Stack.aten_op,
+        exir_ops=Stack.exir_op,
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_FP(test_module: input_t1):
+    test_data = test_module()
+    pipeline = VgfPipeline[input_t1](
+        Stack(),
+        test_data,
+        aten_op=Stack.aten_op,
+        exir_op=Stack.exir_op,
+        tosa_version="TOSA-1.0+FP",
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
+
+
+@common.SkipIfNoModelConverter
+@common.parametrize("test_module", test_data_suite)
+def test_stack_vgf_INT(test_module: input_t1):
+    test_data = test_module()
+    pipeline = VgfPipeline[input_t1](
+        Stack(),
+        test_data,
+        aten_op=Stack.aten_op,
+        exir_op=Stack.exir_op,
+        tosa_version="TOSA-1.0+INT",
+        use_to_edge_transform_and_lower=False,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py
index 9c02243f30f..68b6ad5fb93 100644
--- a/backends/arm/test/ops/test_sub.py
+++ b/backends/arm/test/ops/test_sub.py
@@ -79,6 +79,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
         return x - y
 
 
+class SubAlpha(torch.nn.Module):
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return torch.sub(x, y, alpha=5)
+
+
 class SubTan(torch.nn.Module):
 
     def forward(self, x: torch.Tensor, y: torch.Tensor):
@@ -115,6 +120,18 @@ def test_sub_tensor_tosa_FP_2(test_data: Tuple[torch.Tensor, torch.Tensor]):
     pipeline.run()
 
 
+@common.parametrize("test_data", sub_tan_test_data)
+def test_sub_tensor_tosa_FP_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]):
+    """Test Two-Operand Subtraction with alpha (TOSA FP)"""
+    pipeline = TosaPipelineFP[input_t2](
+        SubAlpha(),
+        test_data(),
+        aten_op,
+        exir_op,
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", sub_test_data)
 def test_sub_tensor_tosa_INT(test_data):
     """Test Subtraction (TOSA INT)"""
@@ -138,6 +155,15 @@ def test_sub_tensor_tosa_INT_3(test_data: Tuple[torch.Tensor, torch.Tensor]):
     pipeline.run()
 
 
+@common.parametrize("test_data", sub_tan_test_data)
+def test_sub_tensor_tosa_INT_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]):
+    """Test Two-Operand Subtraction with alpha (TOSA INT)"""
+    pipeline = TosaPipelineINT[input_t2](
+        SubAlpha(), test_data(), aten_op, exir_op, qtol=0
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", sub_test_data)
 @common.XfailIfNoCorstone300
 def test_sub_tensor_u55_INT(test_data):
diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py
index f08e4498cc5..9f1c437fc65 100644
--- a/backends/arm/test/ops/test_var.py
+++ b/backends/arm/test/ops/test_var.py
@@ -6,7 +6,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 
 from executorch.backends.arm.test import common
@@ -214,7 +213,6 @@ def test_var_dim_u85_INT_no_dim(test_data: Tuple):
 
 @common.parametrize("test_data", Var.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_FP_no_dim(test_data: Tuple):
     data, keepdim, correction = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -225,7 +223,6 @@ def test_var_dim_vgf_FP_no_dim(test_data: Tuple):
 
 @common.parametrize("test_data", Var.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_INT_no_dim(test_data: Tuple):
     data, keepdim, correction = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -296,7 +293,6 @@ def test_var_dim_u85_INT(test_data: Tuple):
 
 @common.parametrize("test_data", VarDim.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_FP(test_data: Tuple):
     data, dim, keepdim, unbiased = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -307,7 +303,6 @@ def test_var_dim_vgf_FP(test_data: Tuple):
 
 @common.parametrize("test_data", VarDim.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_INT(test_data: Tuple):
     data, dim, keepdim, unbiased = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -377,7 +372,6 @@ def test_var_dim_u85_INT_correction(test_data: Tuple):
 
 @common.parametrize("test_data", VarCorrection.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_FP_correction(test_data: Tuple):
     data, dim, keepdim, corr = test_data()
     pipeline = VgfPipeline[input_t1](
@@ -388,7 +382,6 @@ def test_var_dim_vgf_FP_correction(test_data: Tuple):
 
 @common.parametrize("test_data", VarCorrection.test_parameters)
 @common.SkipIfNoModelConverter
-@pytest.mark.xfail(reason="MLETORCH-1410: Tensor dimension count not supported: 0")
 def test_var_dim_vgf_INT_correction(test_data: Tuple):
     data, dim, keepdim, corr = test_data()
     pipeline = VgfPipeline[input_t1](
diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py
index 915eef45755..3e706ae1cac 100644
--- a/backends/arm/test/ops/test_view.py
+++ b/backends/arm/test/ops/test_view.py
@@ -9,7 +9,6 @@
 
 from typing import Tuple
 
-import pytest
 import torch
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
@@ -180,9 +179,6 @@ def get_symmetric_a16w8_view_quantizer(per_channel_quantization=False):
 
 
 @common.parametrize("test_data", View.needs_transpose_tests)
-@pytest.mark.xfail(
-    reason="missing int16 view ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13977"
-)
 def test_view_16a8w_tosa_INT(test_data: Tuple):
     """Test view operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -209,9 +205,6 @@ def test_view_16a8w_tosa_INT(test_data: Tuple):
 
 @common.parametrize("test_data", View.needs_transpose_tests)
 @common.XfailIfNoCorstone300
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 view operations"
-)
 def test_view_16a8w_u55_INT16(test_data: Tuple):
     """Test view operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
@@ -237,9 +230,6 @@ def test_view_16a8w_u55_INT16(test_data: Tuple):
 
 @common.parametrize("test_data", View.needs_transpose_tests)
 @common.XfailIfNoCorstone320
-@pytest.mark.xfail(
-    reason="Vela compilation fails with 'Invalid arguments' for int16 view operations"
-)
 def test_view_16a8w_u85_INT16(test_data: Tuple):
     """Test view operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)"""
     per_channel_quantization = False
diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
new file mode 100644
index 00000000000..096c90d330d
--- /dev/null
+++ b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
@@ -0,0 +1,77 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm._passes import (
+    FoldAndAnnotateQParamsPass,
+    InsertRescaleInt32Pass,
+)
+from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+
+
+class NeedsRescaleOps(torch.nn.Module):
+    """A module containing ops that require INT32 inputs/outputs."""
+
+    input_t = Tuple[torch.Tensor, torch.Tensor]
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        a = torch.maximum(x, y)
+        b = torch.abs(a)
+        c = a > b
+        return c
+
+    def get_inputs(self, dtype) -> input_t:
+        if dtype == torch.float32:
+            return (torch.rand(1, 3, 5, 6), torch.rand(1, 3, 5, 6))
+        elif dtype == torch.int32:
+            return (
+                torch.randint(3, 5, (3,), dtype=torch.int32),
+                torch.randint(3, 5, (3,), dtype=torch.int32),
+            )
+        else:
+            raise ValueError("Not a valid input dtype for model")
+
+
+def test_insert_rescales():
+    module = NeedsRescaleOps()
+    input_t = Tuple[torch.Tensor, torch.Tensor]
+    ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+    ops_after = {
+        # "number of op nodes with i8 output" + "number of i8 node inputs"
+        "executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2
+        + 5,
+    }
+    pipeline = PassPipeline[input_t](
+        module,
+        module.get_inputs(torch.float32),
+        quantize=True,
+        ops_not_before_pass=ops_not_before,
+        ops_after_pass=ops_after,
+        pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+    )
+    pipeline.pop_stage("run_method_and_compare_outputs")
+    pipeline.run()
+
+
+def test_dont_insert_rescales():
+    module = NeedsRescaleOps()
+    input_t = Tuple[torch.Tensor, torch.Tensor]
+    ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+    # All inputs are already i32. Rescales should not be added.
+    ops_not_after = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
+    pipeline = PassPipeline[input_t](
+        module,
+        module.get_inputs(torch.int32),
+        ops_not_before_pass=ops_not_before,
+        ops_not_after_pass=ops_not_after,
+        pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass],
+    )
+    pipeline.pop_stage("run_method_and_compare_outputs")
+    pipeline.run()
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index be87ea629d8..b8e8aee4e3a 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -366,5 +366,20 @@ test_smaller_stories_llama() {
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
+test_memory_allocation() {
+    echo "${TEST_SUITE_NAME}: Test ethos-u memory allocation with run.sh"
+
+    mkdir -p arm_test/test_run
+    # Ethos-U85
+    echo "${TEST_SUITE_NAME}: Test target Ethos-U85"
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=examples/arm/example_modules/add.py &> arm_test/test_run/full.log
+    python3 backends/arm/test/test_memory_allocator_log.py --log arm_test/test_run/full.log \
+            --require "model_pte_program_size" "<= 3000 B" \
+            --require "method_allocator_planned" "<= 64 B" \
+            --require "method_allocator_loaded" "<= 1024 B" \
+            --require "method_allocator_input" "<= 4 B" \
+            --require "Total DRAM used" "<= 0.06 KiB"
+    echo "${TEST_SUITE_NAME}: PASS"
+}
 
 ${TEST_SUITE}
diff --git a/backends/arm/test/test_memory_allocator_log.py b/backends/arm/test/test_memory_allocator_log.py
new file mode 100644
index 00000000000..3853b60b7f6
--- /dev/null
+++ b/backends/arm/test/test_memory_allocator_log.py
@@ -0,0 +1,170 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Check log files for memory metrics and compare them against thresholds.
+
+Usage example:
+  python3 test_memory_allocator_log.py \
+    --log path/to/log.txt \
+    --require "Total SRAM used" "<= 310 KiB" \
+    --require "method_allocator_input" "<= 4 B"
+"""
+
+import argparse
+import re
+import sys
+from typing import List, Optional, Tuple
+
+
+def unit_factor(u: str) -> float:
+    if not u:
+        return 1.0
+    ul = u.strip().lower()
+    table = {
+        "b": 1,
+        "byte": 1,
+        "bytes": 1,
+        "kb": 1000,
+        "mb": 1000**2,
+        "gb": 1000**3,
+        "kib": 1024,
+        "mib": 1024**2,
+        "gib": 1024**3,
+    }
+    if ul in table:
+        return float(table[ul])
+    return 1.0
+
+
+def parse_value(text_num: str, text_unit: Optional[str]) -> float:
+    return float(text_num) * unit_factor(text_unit or "")
+
+
+def parse_cond(cond: str) -> Tuple[str, float, str]:
+    # Regexp explained. Example of things it will parse:
+    # "< 310 KiB", ">=10MB", "== 42", "!=3 bytes", "<=0.5 MiB"
+
+    # The regexp explained in detail:
+    # ^: anchor the match to the start and end of the string (no extra chars allowed).
+    # \s*: optional whitespace (spaces, tabs, etc.).
+    # (<=|>=|==|!=|<|>): capturing group 1. One of the comparison operators: <=, >=, ==, !=, <, >.
+    # \s*: optional whitespace.
+    # ([0-9]+(?:\.[0-9]+)?): capturing group 2. A number:
+    #   [0-9]+: one or more digits (the integer part).
+    #   (?:\.[0-9]+)?: optional non-capturing group for a fractional part like .25.
+    # \s*: optional whitespace between number and unit
+    # ([A-Za-z]+)?: capturing group 3, optional. A unit made of letters only (e.g., B, KB, KiB, MB, MiB). Case# insensitive by class choice.
+    # \s*: optional trailing whitespace.
+    m = re.match(
+        r"^\s*(<=|>=|==|!=|<|>)\s*([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?\s*$", cond
+    )
+    if not m:
+        raise ValueError(f"Invalid condition: {cond}")
+    op, num, unit = m.groups()
+    return op, float(num), (unit or "")
+
+
+def compare(a: float, b: float, op: str) -> bool:
+    return {
+        "<": a < b,
+        "<=": a <= b,
+        ">": a > b,
+        ">=": a >= b,
+        "==": abs(a - b) < 1e-9,
+        "!=": abs(a - b) >= 1e-9,
+    }[op]
+
+
+def find_metric_value(line: str, label: str) -> Tuple[Optional[str], Optional[str]]:
+    # Same regexp as parse_cond() but without the first group of matching comparison operators
+    # First go, search for the pattern but escape and ignore cases
+    # The regexp:
+    # ([0-9]+(?:\.[0-9]+)?) — capturing group 1: a decimal number
+    # [0-9]+ — one or more digits (integer part)
+    # (?:\.[0-9]+)? — optional fractional part like .25 (non-capturing)
+    # \s* — optional whitespace between number and unit
+    # ([A-Za-z]+)? — capturing group 2 (optional): a unit made only of letters (e.g., B, KB, KiB, MB)
+    m = re.search(
+        re.escape(label) + r".*?([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?",
+        line,
+        flags=re.IGNORECASE,
+    )
+    if m:
+        return m.group(1), m.group(2)
+    # Second go, same regexp as above but not caring about label. If
+    # no number was tied to a label be happy just salvaging it from
+    # the line
+    m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", line)
+    if m:
+        return m.group(1), m.group(2)
+    return None, None
+
+
+def first_line_with_label(lines: List[str], label: str) -> Optional[str]:
+    label_lc = label.lower()
+    return next((ln for ln in lines if label_lc in ln.lower()), None)
+
+
+def check_requirement(label: str, cond: str, lines: List[str]) -> Optional[str]:
+    op, thr_num, thr_unit = parse_cond(cond)
+    matched = first_line_with_label(lines, label)
+    if matched is None:
+        return f"{label}: not found in log"
+
+    num_str, unit_str = find_metric_value(matched, label)
+    if num_str is None:
+        return f"{label}: value not found on line: {matched.strip()}"
+
+    left_bytes = parse_value(num_str, unit_str)
+    right_bytes = parse_value(str(thr_num), thr_unit or (unit_str or ""))
+    ok = compare(left_bytes, right_bytes, op)
+
+    human_left = f"{num_str} {unit_str or 'B'}"
+    human_right = f"{thr_num:g} {thr_unit or (unit_str or 'B')}"
+    print(
+        f"[check] {label}: {human_left} {op} {human_right} -> {'OK' if ok else 'FAIL'}"
+    )
+
+    if ok:
+        return None
+    return f"{label}: {human_left} not {op} {human_right}"
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--log", required=True, help="Path to log file")
+    parser.add_argument(
+        "--require",
+        action="append",
+        nargs=2,
+        metavar=("LABEL", "COND"),
+        default=[],
+        help="""Required label and condition consisting
+                         of a number and unit. Example: \"Total DRAM
+                         used\" \"<= 0.06 KiB\"""",
+    )
+    args = parser.parse_args()
+
+    with open(args.log, "r", encoding="utf-8", errors="ignore") as f:
+        lines = f.readlines()
+
+    failures: List[str] = []
+    for label, cond in args.require:
+        msg = check_requirement(label, cond, lines)
+        if msg:
+            failures.append(msg)
+
+    if failures:
+        print("Failures:")
+        for msg in failures:
+            print(" - " + msg)
+        return 1
+
+    print("All checks passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 8bf72827549..0cba8d987c0 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -28,17 +28,11 @@
 
 import torch.fx
 import torch.utils._pytree as pytree
-
 from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
 
 from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
-    EthosUQuantizer,
-    get_symmetric_quantization_config,
-    TOSAQuantizer,
-    VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
 from executorch.backends.arm.test.runner_utils import (
     dbg_tosa_fb_to_json,
     get_output_quantization_params,
@@ -53,9 +47,13 @@
 from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
 from executorch.backends.arm.tosa.mapping import extract_tensor_meta
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
 
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.util._factory import (
+    create_partitioner,
+    create_quantizer,
+    parse_compile_spec,
+)
+from executorch.backends.arm.vgf import VgfCompileSpec
 
 from executorch.backends.test.harness.error_statistics import ErrorStatistics
 from executorch.backends.test.harness.stages import Stage, StageType
@@ -83,7 +81,6 @@
     _copy_module,
     _update_exported_program_graph_module,
 )
-
 from tabulate import tabulate
 
 from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec
@@ -103,12 +100,6 @@ def _dump_lowered_modules_artifact(
         artifact.exported_program().graph_signature
     )
 
-    def get_output_format(lowered_module) -> str | None:
-        for spec in lowered_module.compile_specs:
-            if spec.key == "output_format":
-                return spec.value.decode()
-        return None
-
     for node in graph_module.graph.nodes:
         if node.op == "get_attr" and node.name.startswith("lowered_module_"):
             lowered_module = getattr(graph_module, node.name)
@@ -116,13 +107,13 @@ def get_output_format(lowered_module) -> str | None:
                 lowered_module, LoweredBackendModule
             ), f"Attribute {node.name} must be of type LoweredBackendModule."
 
-            output_format = get_output_format(lowered_module)
-            if output_format == "tosa":
+            compile_spec = parse_compile_spec(lowered_module.compile_specs)
+            if isinstance(compile_spec, TosaCompileSpec):
                 tosa_fb = lowered_module.processed_bytes
                 to_print = dbg_tosa_fb_to_json(tosa_fb)
                 to_print = pformat(to_print, compact=True, indent=1)
                 output += f"\nTOSA deserialized {node.name}: \n{to_print}\n"
-            elif output_format == EthosUCompileSpec.get_output_format():
+            elif isinstance(compile_spec, EthosUCompileSpec):
                 vela_cmd_stream = lowered_module.processed_bytes
                 output += f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n"
             else:
@@ -284,13 +275,7 @@ def quantize(
         quantize_stage: Optional[tester.Quantize] = None,
     ):
         if quantize_stage is None:
-            quantizer = None
-            if isinstance(self.compile_spec, TosaCompileSpec):
-                quantizer = TOSAQuantizer(self.compile_spec)
-            elif isinstance(self.compile_spec, EthosUCompileSpec):
-                quantizer = EthosUQuantizer(self.compile_spec)
-            elif isinstance(self.compile_spec, VgfCompileSpec):
-                quantizer = VgfQuantizer(self.compile_spec)
+            quantizer = create_quantizer(self.compile_spec)
             quantize_stage = tester.Quantize(
                 quantizer,
                 get_symmetric_quantization_config(),
@@ -312,14 +297,7 @@ def to_edge(
 
     def partition(self, partition_stage: Optional[Partition] = None):
         if partition_stage is None:
-            if isinstance(self.compile_spec, TosaCompileSpec):
-                arm_partitioner = TOSAPartitioner(self.compile_spec)
-            elif isinstance(self.compile_spec, EthosUCompileSpec):
-                arm_partitioner = EthosUPartitioner(self.compile_spec)
-            elif isinstance(self.compile_spec, VgfCompileSpec):
-                arm_partitioner = VgfPartitioner(self.compile_spec)
-            else:
-                raise ValueError("compile spec doesn't target any Arm Partitioner")
+            arm_partitioner = create_partitioner(self.compile_spec)
             partition_stage = Partition(arm_partitioner)
         return super().partition(partition_stage)
 
@@ -329,7 +307,7 @@ def to_edge_transform_and_lower(
         partitioners: Optional[List[Partitioner]] = None,
         edge_compile_config: Optional[EdgeCompileConfig] = None,
         additional_checks: Optional[
-            List[Union[DontPartition | DontPartitionModule | DontPartitionName]]
+            List[DontPartition | DontPartitionModule | DontPartitionName]
         ] = None,
         transform_passes: Optional[
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
@@ -343,20 +321,9 @@ def to_edge_transform_and_lower(
 
         if to_edge_and_lower_stage is None:
             if partitioners is None:
-                if isinstance(self.compile_spec, TosaCompileSpec):
-                    arm_partitioner = TOSAPartitioner(
-                        self.compile_spec, additional_checks
-                    )
-                elif isinstance(self.compile_spec, EthosUCompileSpec):
-                    arm_partitioner = EthosUPartitioner(
-                        self.compile_spec, additional_checks
-                    )
-                elif isinstance(self.compile_spec, VgfCompileSpec):
-                    arm_partitioner = VgfPartitioner(
-                        self.compile_spec, additional_checks
-                    )
-                else:
-                    raise ValueError("compile spec doesn't target any Arm Partitioner")
+                arm_partitioner = create_partitioner(
+                    self.compile_spec, additional_checks
+                )
                 partitioners = [arm_partitioner]
             to_edge_and_lower_stage = ToEdgeTransformAndLower(
                 partitioners,
@@ -463,6 +430,10 @@ def run_method_and_compare_outputs(
         for run_iteration in range(num_runs):
             reference_input = inputs if inputs else next(self.generate_random_inputs())
 
+            # Avoid issues with inplace operators
+            test_input = copy.deepcopy(reference_input)
+            original_input = copy.deepcopy(reference_input)
+
             input_shapes = [
                 generated_input.shape if hasattr(generated_input, "shape") else (1,)
                 for generated_input in reference_input
@@ -477,16 +448,16 @@ def run_method_and_compare_outputs(
                 # Run exported module directly
                 test_outputs, _ = pytree.tree_flatten(
                     self._calculate_reference_output(
-                        exported_program.module(), reference_input
+                        exported_program.module(), test_input
                     )
                 )
             else:
                 # Run lowered model with target
                 test_outputs, _ = pytree.tree_flatten(
-                    test_stage.run_artifact(reference_input)
+                    test_stage.run_artifact(test_input)
                 )
 
-            logger.info(f"\n      Input: {reference_input}")
+            logger.info(f"\n      Input: {original_input}")
             logger.info(f"\n Ref output: {reference_outputs}")
             logger.info(f"\nTest output: {test_outputs}")
 
@@ -743,22 +714,19 @@ def _get_tosa_operator_distribution(
     op_list = []
     id = 0
     while lowered_module := getattr(graph_module, f"lowered_module_{id}", None):
-        for spec in lowered_module.compile_specs:
-            if spec.key != "output_format":
-                continue
-            if spec.value == b"tosa":
-                tosa_fb = lowered_module.processed_bytes
-                tosa_json = dbg_tosa_fb_to_json(tosa_fb)
-                for region in tosa_json["regions"]:
-                    for block in region["blocks"]:
-                        op_list.extend(
-                            [operator["op"] for operator in block["operators"]]
-                        )
-                break
-            elif spec.value == EthosUCompileSpec.get_output_format().encode():
-                return "Can not get operator distribution for Vela command stream."
-            else:
-                return f"Unknown output format '{spec.value}'."
+        compile_spec = parse_compile_spec(lowered_module.compile_specs)
+        if isinstance(compile_spec, TosaCompileSpec):
+            tosa_fb = lowered_module.processed_bytes
+            tosa_json = dbg_tosa_fb_to_json(tosa_fb)
+            for region in tosa_json["regions"]:
+                for block in region["blocks"]:
+                    op_list.extend([operator["op"] for operator in block["operators"]])
+        elif isinstance(compile_spec, EthosUCompileSpec):
+            return "Can not get operator distribution for Vela command stream."
+        elif isinstance(compile_spec, VgfCompileSpec):
+            return "Can not get operator distribution for VGF."
+        else:
+            return f"Unknown output format '{compile_spec.get_output_format()}'."
         id += 1
     if id == 0:
         return "No delegate with name 'lowered_module_0 found in graph module."
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py
index 7596573be84..7a7ea2ca377 100644
--- a/backends/arm/tosa/backend.py
+++ b/backends/arm/tosa/backend.py
@@ -206,8 +206,8 @@ def filter_tosa_compile_specs(
         hardware.
         """
 
-        new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec)
-        new_compile_spec._set_compile_specs(
-            compile_spec.tosa_spec, [], compile_spec.get_intermediate_path()
+        return (
+            TosaCompileSpec(compile_spec.tosa_spec)
+            .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path())
+            .dump_debug_info(compile_spec.tosa_debug_mode)
         )
-        return new_compile_spec
diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py
index f1e3a29ac22..897de70279f 100644
--- a/backends/arm/tosa/dialect/__init__.py
+++ b/backends/arm/tosa/dialect/__init__.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from executorch.backends.arm.tosa.dialect.ops import (  # noqa F401
+    matmul,
     rescale,
     resize,
     table,
diff --git a/backends/arm/tosa/dialect/ops/matmul.py b/backends/arm/tosa/dialect/ops/matmul.py
new file mode 100644
index 00000000000..1ba3821f674
--- /dev/null
+++ b/backends/arm/tosa/dialect/ops/matmul.py
@@ -0,0 +1,56 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm.tosa.dialect.lib import TosaValueError
+from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op
+
+from executorch.backends.arm.tosa.specification import (
+    get_context_spec,
+    TosaSpecification,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+@register_fake_tosa_op(
+    "MATMUL(Tensor input1, Tensor input2) -> Tensor",  # schema
+    (
+        TosaSpecification.create_from_string("TOSA-1.0+INT"),
+    ),  # target TOSA specifications
+)
+def MATMUL(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+    tosa_spec = get_context_spec()
+    """Performs matrix multiplication on two input tensors.
+    Additionally validates TOSA constraints of a MATMUL op.
+    """
+    if x1.dtype != x2.dtype:
+        raise TosaValueError(
+            f"Input tensors must have the same dtype, got {x1.dtype} and {x2.dtype}",
+            op="MATMUL",
+        )
+    if x1.dtype in (torch.int8, torch.int16):
+        if not tosa_spec.support_integer():
+            raise TosaValueError(
+                f"TOSA spec {tosa_spec} doesn't support integers", op="MATMUL"
+            )
+        else:
+            dtype = torch.int32
+    elif x1.dtype in (torch.float16, torch.float32):
+        if not tosa_spec.support_float():
+            raise TosaValueError(
+                f"TOSA spec {tosa_spec} doesn't support float", op="MATMUL"
+            )
+        else:
+            # float16 supports float16 accumulation as well
+            dtype = torch.float32
+    else:
+        raise TosaValueError(
+            f"Input tensors must be of type int8, float16 or float32, got {x1.dtype}",
+            op="MATMUL",
+        )
+
+    aten_fake_tensor = exir_ops.edge.aten.bmm.default(x1, x2)
+
+    return torch.empty_like(aten_fake_tensor, dtype=dtype)
diff --git a/backends/arm/tosa/dialect/ops/transpose.py b/backends/arm/tosa/dialect/ops/transpose.py
index 9c5aba05394..8d5bf8bac70 100644
--- a/backends/arm/tosa/dialect/ops/transpose.py
+++ b/backends/arm/tosa/dialect/ops/transpose.py
@@ -26,9 +26,9 @@ def TRANSPOSE(a, perms):
     # By utilizing an edge IR passthrough operator we can keep the edge program in
     # channels-first/contiguous and get the desired behavior in the TOSA lowering.
 
-    if len(perms) not in (4, 5):
+    if len(perms) not in (4, 5, 6):
         raise TosaValueError(
-            f"Only 4D and 5D tensors are supported, got {len(perms)}: {perms}",
+            f"Only 4D, 5D and 6D tensors are supported, got {len(perms)}: {perms}",
             op="TRANSPOSE",
         )
 
diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py
index 3e512847109..6eb1dcbef72 100644
--- a/backends/arm/tosa/partitioner.py
+++ b/backends/arm/tosa/partitioner.py
@@ -4,6 +4,15 @@
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
+"""Provide a partitioner for delegating subgraphs to the TOSA backend.
+
+Implement logic to identify and tag regions of an ``ExportedProgram`` that can
+be delegated to the TOSA backend. Use this module to:
+
+- Partition graphs based on operator support and additional checks.
+- Prune trivial no-op partitions that would lower to empty TOSA graphs.
+- Tag constant data and report reasons for rejected nodes.
+"""
 
 import logging
 from typing import Callable, List, Optional, Sequence, Tuple
@@ -34,14 +43,46 @@
 
 
 def is_noop_clone(node: torch.fx.node.Node) -> bool:
+    """Return True if the node is a no-op ``dim_order_ops._clone_dim_order``.
+
+    Args:
+        node (torch.fx.Node): FX node to inspect.
+
+    Returns:
+        bool: True if the node targets ``dim_order_ops._clone_dim_order.default``
+        in the Edge dialect; otherwise, False.
+
+    """
     return node.target == exir_ops.edge.dim_order_ops._clone_dim_order.default
 
 
 def is_noop_alias_copy(node: torch.fx.Node) -> bool:
+    """Return True if the node is a no-op ``aten.alias_copy``.
+
+    Args:
+        node (torch.fx.Node): FX node to inspect.
+
+    Returns:
+        bool: True if the node targets ``aten.alias_copy.default``; otherwise,
+        False.
+
+    """
     return node.target == exir_ops.edge.aten.alias_copy.default
 
 
 def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool:
+    """Return True if node is a no-op ``dim_order_ops._to_dim_order_copy``.
+
+    Consider the op a no-op when the output dtype equals the input's dtype.
+
+    Args:
+        node (torch.fx.Node): FX node to inspect.
+
+    Returns:
+        bool: True if it targets ``_to_dim_order_copy.default`` and preserves
+        dtype; otherwise, False.
+
+    """
     if node.target != exir_ops.edge.dim_order_ops._to_dim_order_copy.default:
         return False
     else:
@@ -49,6 +90,19 @@ def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool:
 
 
 def is_noop_expand(node: torch.fx.node.Node) -> bool:
+    """Return True if the node is an ``expand_copy`` with all-ones multiples.
+
+    This corresponds to a semantic no-op, since expanding by 1 along every
+    dimension leaves the tensor unchanged.
+
+    Args:
+        node (torch.fx.Node): FX node to inspect.
+
+    Returns:
+        bool: True if the node targets ``aten.expand_copy.default`` and all
+        computed multiples are 1; otherwise, False.
+
+    """
     if node.target != exir_ops.edge.aten.expand_copy.default:
         return False
     else:
@@ -57,11 +111,30 @@ def is_noop_expand(node: torch.fx.node.Node) -> bool:
 
 
 class TOSAPartitioner(Partitioner):
+    """Partition an exported program into TOSA-delegable subgraphs.
+
+    Construct this partitioner for compile specs targeting TOSA. The partition
+    algorithm uses capability checks and optional additional operator-support
+    rules to tag nodes with a delegation tag per subgraph.
+    """
+
     def __init__(
         self,
         compile_spec: TosaCompileSpec,
         additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
     ) -> None:
+        """Initialize the TOSAPartitioner.
+
+        Args:
+            compile_spec (TosaCompileSpec): Parsed compile specifications for
+                TOSA containing the TOSA spec and original list.
+            additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra
+                operator-support checks to apply when partitioning.
+
+        Raises:
+            RuntimeError: If the provided compile spec does not target TOSA.
+
+        """
         self.delegation_spec = DelegationSpec(
             TOSABackend.__name__, compile_spec.to_list()
         )
@@ -70,9 +143,22 @@ def __init__(
         self.tosa_spec = compile_spec.tosa_spec
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:  # noqa
-        # Run the CapabilityBasedPartitioner to return the largest possible
-        # subgraphs containing the nodes with the tags
+        """Partition the program and tag TOSA-compatible subgraphs.
+
+        Run the FX capability-based partitioner to propose subgraphs, then
+        refine tags by removing boundary-only quantize/dequantize nodes and by
+        rejecting partitions that would lower to no-ops. Emit a detailed report
+        of rejected nodes and their reasons.
+
+        Args:
+            exported_program (ExportedProgram): Program to analyze and
+                partition.
+
+        Returns:
+            PartitionResult: The input program with nodes tagged for delegation
+            and a mapping of partition tags to delegation specs.
 
+        """
         logger.info("TOSAPartitioner::partition")
         partition_tags: dict[str, DelegationSpec] = {}
 
@@ -92,6 +178,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:  # no
         partition_list = capability_partitioner.propose_partitions()
 
         def reject_partition(reason: str, partition, tag) -> None:
+            """Remove a proposed partition and record the rejection reason.
+
+            Args:
+                reason (str): Human-readable explanation for rejection.
+                partition (object): Proposed partition object from the
+                    capability partitioner.
+                tag (str): Delegation tag associated with the partition.
+
+            """
             for node in partition.nodes:
                 if "delegation_tag" in node.meta:
                     del node.meta["delegation_tag"]
@@ -105,6 +200,16 @@ def reject_partition(reason: str, partition, tag) -> None:
             tag = f"tag{partition.id}"
 
             def is_partitioned(node: torch.fx.Node, tag=tag) -> bool:
+                """Return True if the node currently belongs to the partition ``tag``.
+
+                Args:
+                    node (torch.fx.Node): FX node to check.
+                    tag (str): Delegation tag identifying the partition.
+
+                Returns:
+                    bool: True if the node carries the matching delegation tag.
+
+                """
                 return (
                     "delegation_tag" in node.meta and node.meta["delegation_tag"] == tag
                 )
@@ -113,8 +218,8 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool:
                 node.meta["delegation_tag"] = tag
                 partition_tags[tag] = self.delegation_spec
 
-            # De-tag outmost q-nodes upwards and dq-nodes downwards.
-            # De-tag if at least one input/ output is not part of partition.
+            # De-tag outermost q-nodes upwards and dq-nodes downwards.
+            # De-tag if at least one input/output is not part of the partition.
             for node in exported_program.graph_module.graph.nodes:
                 if not is_partitioned(node):
                     continue
@@ -175,15 +280,41 @@ def ops_to_not_decompose(
         self,
         ep: ExportedProgram,
     ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        """Return operators and a filter that should not be decomposed.
+
+        Provide a base set of ops to preserve as-is and a predicate that keeps
+        certain activations whole when surrounded by quantize/dequantize ops in
+        a quantized graph. This helps downstream TOSA lowering and delegation.
+
+        Args:
+            ep (ExportedProgram): Program used to infer target-specific policy.
+
+        Returns:
+            Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+                A list of op overloads to keep intact, and an optional filter
+                function that returns True when an op should not be decomposed.
+
+        """
         ops_to_not_decompose_if_quant_op = [
             torch.ops.aten.hardsigmoid.default,
             torch.ops.aten.hardswish.default,
         ]
 
         def filter_fn(node: torch.fx.Node) -> bool:
-            # This function filters for operators to not decompose where:
-            #   - It's target is in ops_to_not_decompose_if_quant_op list.
-            #   - All it's inputs/outputs are quantize operators.
+            """Return True to keep selected ops intact inside quantized regions.
+
+            The predicate holds when the target is in
+            ``ops_to_not_decompose_if_quant_op`` and all inputs/outputs are
+            quantize/dequantize ops, indicating a quantized activation that
+            should not be decomposed.
+
+            Args:
+                node (torch.fx.Node): FX node to evaluate.
+
+            Returns:
+                bool: True to keep the op intact; otherwise, False.
+
+            """
             dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default
             q = torch.ops.quantized_decomposed.quantize_per_tensor.default
 
@@ -204,7 +335,7 @@ def filter_fn(node: torch.fx.Node) -> bool:
 
                 return should_not_decompose
 
-            # Be default, do not decompose the operator
+            # By default, do not decompose the operator
             return True
 
         ops_to_not_decompose = [
diff --git a/backends/arm/util/_factory.py b/backends/arm/util/_factory.py
new file mode 100644
index 00000000000..23d8215fc9b
--- /dev/null
+++ b/backends/arm/util/_factory.py
@@ -0,0 +1,59 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
+from executorch.backends.arm.quantizer import (
+    EthosUQuantizer,
+    TOSAQuantizer,
+    VgfQuantizer,
+)
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+def parse_compile_spec(compile_specs: list[CompileSpec]) -> ArmCompileSpec:
+    output_format = None
+    for spec in compile_specs:
+        if spec.key == "output_format":
+            output_format = spec.value.decode()
+            break
+    else:
+        raise ValueError("Compile spec without output format.")
+    if output_format == TosaCompileSpec.get_output_format():
+        return TosaCompileSpec.from_list(compile_specs)
+    if output_format == EthosUCompileSpec.get_output_format():
+        return EthosUCompileSpec.from_list(compile_specs)
+    if output_format == VgfCompileSpec.get_output_format():
+        return VgfCompileSpec.from_list(compile_specs)
+    raise ValueError(f"Unknown output format {output_format}")
+
+
+def create_partitioner(
+    compile_spec: ArmCompileSpec,
+    additional_checks: list[OperatorSupportBase] | None = None,
+):
+    if isinstance(compile_spec, TosaCompileSpec):
+        return TOSAPartitioner(compile_spec, additional_checks)
+    elif isinstance(compile_spec, EthosUCompileSpec):
+        return EthosUPartitioner(compile_spec, additional_checks)
+    elif isinstance(compile_spec, VgfCompileSpec):
+        return VgfPartitioner(compile_spec, additional_checks)
+    else:
+        raise ValueError("compile spec doesn't target any Arm Partitioner")
+
+
+def create_quantizer(compile_spec: ArmCompileSpec):
+    if isinstance(compile_spec, TosaCompileSpec):
+        return TOSAQuantizer(compile_spec)
+    elif isinstance(compile_spec, EthosUCompileSpec):
+        return EthosUQuantizer(compile_spec)
+    elif isinstance(compile_spec, VgfCompileSpec):
+        return VgfQuantizer(compile_spec)
+    else:
+        raise ValueError("compile spec doesn't target any Arm Quantizer")
diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py
index cbfa337ab09..8c36128cea8 100644
--- a/backends/arm/util/arm_model_evaluator.py
+++ b/backends/arm/util/arm_model_evaluator.py
@@ -1,5 +1,4 @@
 # Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -30,7 +29,139 @@
 logger.setLevel(logging.INFO)
 
 
+# ImageNet 224x224 transforms (Resize->CenterCrop->ToTensor->Normalize)
+# If future models require different preprocessing, extend this helper accordingly.
+def _get_imagenet_224_transforms():
+    """Return standard ImageNet 224x224 preprocessing transforms."""
+    return transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220]),
+        ]
+    )
+
+
+def _build_calibration_loader(
+    dataset: datasets.ImageFolder, max_items: int
+) -> DataLoader:
+    """Return a DataLoader over a deterministic, shuffled subset of size <= max_items.
+
+    Shuffles with seed: ARM_EVAL_CALIB_SEED (int) or default 1337; then selects first k and
+    sorts indices to keep enumeration order stable while content depends on seed.
+    """
+    k = min(max_items, len(dataset))
+    seed_env = os.getenv("ARM_EVAL_CALIB_SEED")
+    default_seed = 1337
+    if seed_env is not None:
+        try:
+            seed = int(seed_env)
+        except ValueError:
+            logger.warning(
+                "ARM_EVAL_CALIB_SEED is not an int (%s); using default seed %d",
+                seed_env,
+                default_seed,
+            )
+            seed = default_seed
+    else:
+        seed = default_seed
+    rng = random.Random(seed)
+    indices = list(range(len(dataset)))
+    rng.shuffle(indices)
+    selected = sorted(indices[:k])
+    return torch.utils.data.DataLoader(
+        torch.utils.data.Subset(dataset, selected), batch_size=1, shuffle=False
+    )
+
+
+def _load_imagenet_folder(directory: str) -> datasets.ImageFolder:
+    """Shared helper to load an ImageNet-layout folder.
+
+    Raises FileNotFoundError for a missing directory early to aid debugging.
+    """
+    directory_path = Path(directory)
+    if not directory_path.exists():
+        raise FileNotFoundError(f"Directory: {directory} does not exist.")
+    transform = _get_imagenet_224_transforms()
+    return datasets.ImageFolder(directory_path, transform=transform)
+
+
 class GenericModelEvaluator:
+    """Base evaluator computing quantization error metrics and optional compression ratio.
+
+    Subclasses can extend: provide calibration (get_calibrator) and override evaluate()
+    to add domain specific metrics (e.g. top-1 / top-5 accuracy).
+    """
+
+    @staticmethod
+    def evaluate_topk(
+        model: Module,
+        dataset: datasets.ImageFolder,
+        batch_size: int,
+        topk: int = 5,
+        log_every: int = 50,
+    ) -> Tuple[float, float]:
+        """Evaluate model top-1 / top-k accuracy.
+
+        Args:
+            model: Torch module (should be in eval() mode prior to call).
+            dataset: ImageFolder style dataset.
+            batch_size: Batch size for evaluation.
+            topk: Maximum k for accuracy (default 5).
+            log_every: Log running accuracy every N batches.
+        Returns:
+            (top1_accuracy, topk_accuracy)
+        """
+        # Some exported / quantized models (torchao PT2E) disallow direct eval()/train().
+        # Try to switch to eval mode, but degrade gracefully if unsupported.
+        try:
+            model.eval()
+        except NotImplementedError:
+            # Attempt to enable train/eval overrides if torchao helper is present.
+            try:
+                from torchao.quantization.pt2e.utils import (  # type: ignore
+                    allow_exported_model_train_eval,
+                )
+
+                allow_exported_model_train_eval(model)
+                try:
+                    model.eval()
+                except Exception:
+                    logger.debug(
+                        "Model eval still not supported after allow_exported_model_train_eval; proceeding without explicit eval()."
+                    )
+            except Exception:
+                logger.debug(
+                    "Model eval() unsupported and torchao allow_exported_model_train_eval not available; proceeding."
+                )
+        loaded_dataset = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+        top1_correct = 0
+        topk_correct = 0
+        total = 0
+        with torch.inference_mode():  # disable autograd + some backend optimizations
+            for i, (image, target) in enumerate(loaded_dataset):
+                prediction = model(image)
+                topk_indices = torch.topk(prediction, k=topk, dim=1).indices
+                # target reshaped for broadcasting
+                target_view = target.view(-1, 1)
+                top1_correct += (topk_indices[:, :1] == target_view).sum().item()
+                topk_correct += (topk_indices == target_view).sum().item()
+                batch_sz = image.size(0)
+                total += batch_sz
+                if (i + 1) % log_every == 0 or total == len(dataset):
+                    logger.info(
+                        "Eval progress: %d / %d  top1=%.4f top%d=%.4f",
+                        total,
+                        len(dataset),
+                        top1_correct / total,
+                        topk,
+                        topk_correct / total,
+                    )
+        top1_accuracy = top1_correct / len(dataset)
+        topk_accuracy = topk_correct / len(dataset)
+        return top1_accuracy, topk_accuracy
+
     REQUIRES_CONFIG = False
 
     def __init__(
@@ -53,12 +184,13 @@ def __init__(
             self.tosa_output_path = ""
 
     def get_model_error(self) -> defaultdict:
-        """
-        Returns a dict containing the following metrics between the outputs of the FP32 and INT8 model:
-        - Maximum error
-        - Maximum absolute error
-        - Maximum percentage error
-        - Mean absolute error
+        """Return per-output quantization error statistics.
+
+        Metrics (lists per output tensor):
+            max_error
+            max_absolute_error
+            max_percentage_error (safe-divided; zero fp32 elements -> 0%)
+            mean_absolute_error
         """
         fp32_outputs, _ = tree_flatten(self.fp32_model(*self.example_input))
         int8_outputs, _ = tree_flatten(self.int8_model(*self.example_input))
@@ -67,7 +199,12 @@ def get_model_error(self) -> defaultdict:
 
         for fp32_output, int8_output in zip(fp32_outputs, int8_outputs):
             difference = fp32_output - int8_output
-            percentage_error = torch.div(difference, fp32_output) * 100
+            # Avoid divide by zero: elements where fp32 == 0 produce 0% contribution
+            percentage_error = torch.where(
+                fp32_output != 0,
+                difference / fp32_output * 100,
+                torch.zeros_like(difference),
+            )
             model_error_dict["max_error"].append(torch.max(difference).item())
             model_error_dict["max_absolute_error"].append(
                 torch.max(torch.abs(difference)).item()
@@ -132,77 +269,116 @@ def __init__(
 
     @staticmethod
     def __load_dataset(directory: str) -> datasets.ImageFolder:
-        directory_path = Path(directory)
-        if not directory_path.exists():
-            raise FileNotFoundError(f"Directory: {directory} does not exist.")
-
-        transform = transforms.Compose(
-            [
-                transforms.Resize(256),
-                transforms.CenterCrop(224),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220]
-                ),
-            ]
-        )
-        return datasets.ImageFolder(directory_path, transform=transform)
+        return _load_imagenet_folder(directory)
 
     @staticmethod
     def get_calibrator(training_dataset_path: str) -> DataLoader:
         dataset = MobileNetV2Evaluator.__load_dataset(training_dataset_path)
-        rand_indices = random.sample(range(len(dataset)), k=1000)
+        return _build_calibration_loader(dataset, 1000)
 
-        # Return a subset of the dataset to be used for calibration
-        return torch.utils.data.DataLoader(
-            torch.utils.data.Subset(dataset, rand_indices),
-            batch_size=1,
-            shuffle=False,
+    @classmethod
+    def from_config(
+        cls,
+        model_name: str,
+        fp32_model: Module,
+        int8_model: Module,
+        example_input: Tuple[torch.Tensor],
+        tosa_output_path: str | None,
+        config: dict[str, Any],
+    ) -> "MobileNetV2Evaluator":
+        """Factory constructing evaluator from a config dict.
+
+        Expected keys: batch_size, validation_dataset_path
+        """
+        return cls(
+            model_name,
+            fp32_model,
+            int8_model,
+            example_input,
+            tosa_output_path,
+            batch_size=config["batch_size"],
+            validation_dataset_path=config["validation_dataset_path"],
         )
 
-    def __evaluate_mobilenet(self) -> Tuple[float, float]:
+    def evaluate(self) -> dict[str, Any]:
+        # Load dataset and compute top-1 / top-5
         dataset = MobileNetV2Evaluator.__load_dataset(self.__validation_set_path)
-        loaded_dataset = DataLoader(
-            dataset,
-            batch_size=self.__batch_size,
-            shuffle=False,
+        top1_correct, top5_correct = GenericModelEvaluator.evaluate_topk(
+            self.int8_model, dataset, self.__batch_size, topk=5
         )
+        output = super().evaluate()
 
-        top1_correct = 0
-        top5_correct = 0
+        output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct}
+        return output
 
-        for i, (image, target) in enumerate(loaded_dataset):
-            prediction = self.int8_model(image)
-            top1_prediction = torch.topk(prediction, k=1, dim=1).indices
-            top5_prediction = torch.topk(prediction, k=5, dim=1).indices
 
-            top1_correct += (top1_prediction == target.view(-1, 1)).sum().item()
-            top5_correct += (top5_prediction == target.view(-1, 1)).sum().item()
+class DeiTTinyEvaluator(GenericModelEvaluator):
+    REQUIRES_CONFIG = True
 
-            logger.info("Iteration: {}".format((i + 1) * self.__batch_size))
-            logger.info(
-                "Top 1: {}".format(top1_correct / ((i + 1) * self.__batch_size))
-            )
-            logger.info(
-                "Top 5: {}".format(top5_correct / ((i + 1) * self.__batch_size))
-            )
+    def __init__(
+        self,
+        model_name: str,
+        fp32_model: Module,
+        int8_model: Module,
+        example_input: Tuple[torch.Tensor],
+        tosa_output_path: str | None,
+        batch_size: int,
+        validation_dataset_path: str,
+    ) -> None:
+        super().__init__(
+            model_name, fp32_model, int8_model, example_input, tosa_output_path
+        )
+        self.__batch_size = batch_size
+        self.__validation_set_path = validation_dataset_path
 
-        top1_accuracy = top1_correct / len(dataset)
-        top5_accuracy = top5_correct / len(dataset)
+    @staticmethod
+    def __load_dataset(directory: str) -> datasets.ImageFolder:
+        return _load_imagenet_folder(directory)
 
-        return top1_accuracy, top5_accuracy
+    @staticmethod
+    def get_calibrator(training_dataset_path: str) -> DataLoader:
+        dataset = DeiTTinyEvaluator.__load_dataset(training_dataset_path)
+        return _build_calibration_loader(dataset, 1000)
+
+    @classmethod
+    def from_config(
+        cls,
+        model_name: str,
+        fp32_model: Module,
+        int8_model: Module,
+        example_input: Tuple[torch.Tensor],
+        tosa_output_path: str | None,
+        config: dict[str, Any],
+    ) -> "DeiTTinyEvaluator":
+        """Factory constructing evaluator from a config dict.
+
+        Expected keys: batch_size, validation_dataset_path
+        """
+        return cls(
+            model_name,
+            fp32_model,
+            int8_model,
+            example_input,
+            tosa_output_path,
+            batch_size=config["batch_size"],
+            validation_dataset_path=config["validation_dataset_path"],
+        )
 
     def evaluate(self) -> dict[str, Any]:
-        top1_correct, top5_correct = self.__evaluate_mobilenet()
+        # Load dataset and compute top-1 / top-5
+        dataset = DeiTTinyEvaluator.__load_dataset(self.__validation_set_path)
+        top1, top5 = GenericModelEvaluator.evaluate_topk(
+            self.int8_model, dataset, self.__batch_size, topk=5
+        )
         output = super().evaluate()
-
-        output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct}
+        output["metrics"]["accuracy"] = {"top-1": top1, "top-5": top5}
         return output
 
 
 evaluators: dict[str, type[GenericModelEvaluator]] = {
     "generic": GenericModelEvaluator,
     "mv2": MobileNetV2Evaluator,
+    "deit_tiny": DeiTTinyEvaluator,
 }
 
 
@@ -223,6 +399,10 @@ def evaluator_calibration_data(
             return evaluator.get_calibrator(
                 training_dataset_path=config["training_dataset_path"]
             )
+        if evaluator is DeiTTinyEvaluator:
+            return evaluator.get_calibrator(
+                training_dataset_path=config["training_dataset_path"]
+            )
         else:
             raise RuntimeError(f"Unknown evaluator: {evaluator_name}")
 
@@ -238,30 +418,30 @@ def evaluate_model(
 ) -> None:
     evaluator = evaluators[evaluator_name]
 
-    # Get the path of the TOSA flatbuffer that is dumped
     intermediates_path = Path(intermediates)
     tosa_paths = list(intermediates_path.glob("*.tosa"))
 
     if evaluator.REQUIRES_CONFIG:
         assert evaluator_config is not None
-
         config_path = Path(evaluator_config)
         with config_path.open() as f:
             config = json.load(f)
 
-        if evaluator == MobileNetV2Evaluator:
-            mv2_evaluator = cast(type[MobileNetV2Evaluator], evaluator)
-            init_evaluator: GenericModelEvaluator = mv2_evaluator(
+        # Prefer a subclass provided from_config if available.
+        if hasattr(evaluator, "from_config"):
+            factory = cast(Any, evaluator.from_config)  # type: ignore[attr-defined]
+            init_evaluator = factory(
                 model_name,
                 model_fp32,
                 model_int8,
                 example_inputs,
                 str(tosa_paths[0]),
-                batch_size=config["batch_size"],
-                validation_dataset_path=config["validation_dataset_path"],
+                config,
             )
         else:
-            raise RuntimeError(f"Unknown evaluator {evaluator_name}")
+            raise RuntimeError(
+                f"Evaluator {evaluator_name} requires config but does not implement from_config()"
+            )
     else:
         init_evaluator = evaluator(
             model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0])
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
index 6c497d5bec4..765ddcd581d 100644
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
@@ -24,6 +24,7 @@
 from executorch.backends.cadence.aot.quantizer.quantizer import (
     CadenceDefaultQuantizer,
     CadenceQuantizer,
+    CadenceW8A32MixedQuantizer,
 )
 from executorch.backends.cadence.aot.utils import (
     get_default_memory_config,
@@ -59,6 +60,7 @@ def trace(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
     dump_graphs: bool = False,
+    quantizer: Optional[CadenceQuantizer] = None,
 ) -> ExportedProgram:
     """
     Trace the model with export and return an ExportedProgram.
@@ -73,6 +75,12 @@ def trace(
         torch.ops.aten.rms_norm.default,
     ]
 
+    if isinstance(quantizer, CadenceW8A32MixedQuantizer):
+        ops_to_keep += [
+            torch.ops.aten.gru.input,
+            torch.ops.aten.gru.data,
+        ]
+
     program = trace_fn(
         model, inputs, is_qat=False, strict=True, ops_to_keep=ops_to_keep
     )
@@ -99,7 +107,7 @@ def prepare_pt2(
     Returns a GraphModule with the prepared model.
     """
 
-    traced_program = trace(model, inputs, dump_graphs=dump_graphs)
+    traced_program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer)
     prepared_program = prepare_traced_pt2(
         traced_program, quantizer, dump_graphs=dump_graphs
     )
@@ -184,7 +192,7 @@ def get_fake_quant_model(
     # Make the model inference mode by calling model.eval()
     model.eval()
 
-    program = trace(model, inputs, dump_graphs=dump_graphs)
+    program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer)
 
     if dump_graphs:
         logging.info("Graph after trace:")
diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index bcab980abd6..3bdbb33d59b 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -548,3 +548,18 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::HiFi::quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out
+
+- func: cadence::quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::HiFi::quantized_w8a32_linear_out
+
+- func: cadence::quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::HiFi::quantized_w8a32_conv_out
+
+- func: cadence::quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::HiFi::quantized_w8a32_gru_out
diff --git a/backends/cadence/aot/graph_builder.py b/backends/cadence/aot/graph_builder.py
index 2cfd7900e8e..f609ba55472 100644
--- a/backends/cadence/aot/graph_builder.py
+++ b/backends/cadence/aot/graph_builder.py
@@ -44,12 +44,12 @@ class GraphBuilder(ExportPass):
         gm = builder.get_graph_module()
     """
 
-    def __init__(self) -> None:
+    def __init__(self, fake_tensor_mode: Optional[FakeTensorMode] = None) -> None:
         self.exporter = ExportPass()
         self.tracer: ExportPass.ExportTracer = self.ExportTracer(
             self, torch.fx.graph.CodeGen()
         )
-        self.fake_tensor_mode = FakeTensorMode(
+        self.fake_tensor_mode: FakeTensorMode = fake_tensor_mode or FakeTensorMode(
             allow_fallback_kernels=False,
             allow_non_fake_inputs=True,
         )
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
index e3009163d62..f827488adfb 100644
--- a/backends/cadence/aot/ops_registrations.py
+++ b/backends/cadence/aot/ops_registrations.py
@@ -320,7 +320,7 @@
     "float out_scale, int out_zero_point) -> (Tensor Z)"
 )
 lib.define(
-    "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, "
+    "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "Tensor indices, bool pruned_weights=False) -> (Tensor X)"
 )
 lib.define(
@@ -329,7 +329,7 @@
     "Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor out)"
 )
 lib.define(
-    "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, "
+    "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], bool ceil_mode=False, "
     "bool count_include_pad=True, int? divisor_override=None, Tensor? in_zero_point=None, bool channel_last=False) -> (Tensor out)"
 )
 lib.define(
@@ -514,7 +514,7 @@
     "int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
 )
 lib.define(
-    "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, "
+    "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "Tensor indices, bool pruned_weights=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
@@ -525,7 +525,7 @@
     "Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
 lib.define(
-    "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, "
+    "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], "
     "bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, "
     "Tensor? in_zero_point=None, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
@@ -564,6 +564,29 @@
     "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
+lib.define(
+    "quantized_w8a32_linear(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor"
+)
+lib.define(
+    "quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)"
+)
+
+lib.define(
+    "quantized_w8a32_conv(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor"
+)
+lib.define(
+    "quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)"
+)
+
+lib.define(
+    "quantized_w8a32_gru(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale) -> Tensor"
+)
+
+lib.define(
+    "quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!)"
+)
+
+
 # Custom ops with aten namespace. Need to specify the lib var as FRAGMENT type as aten library is already defined
 aten_lib = Library("aten", "FRAGMENT")
 aten_lib.define(
@@ -2302,6 +2325,28 @@ def transposed_im2row_meta(
     return input.new_empty(output_size, dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_embedding_byte")
+def quantized_embedding_byte_meta(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: torch.Tensor | None,
+    indices: torch.Tensor,
+    pruned_weights: bool = False,
+) -> torch.Tensor:
+    assert not pruned_weights
+    assert len(weight.shape) == 2
+    assert 1 <= len(weight_scales.shape) <= 2
+    if len(weight_scales.shape) == 2:
+        num_groups = weight_scales.shape[-1]
+        assert weight.shape[1] % num_groups == 0
+
+    if weight_zero_points is not None:
+        assert weight_zero_points.shape == weight_scales.shape
+
+    assert 1 <= len(indices.shape) <= 2
+    return torch.empty(*indices.shape, weight.shape[1], dtype=torch.float32)
+
+
 @register_fake("cadence::where_Scalar")
 def where_Scalar_meta(
     condition: torch.Tensor,
@@ -2562,3 +2607,67 @@ def quantized_softmax_per_tensor_meta(
     out_zero_point: int,
 ) -> torch.Tensor:
     return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_w8a32_linear")
+def quantized_w8a32_linear_meta(
+    src: torch.Tensor,
+    weight: torch.Tensor,
+    w_scale: float,
+    bias: torch.Tensor,
+    b_scale: float,
+) -> torch.Tensor:
+    # src comes in shape [leading_dims, in_dim]
+    # weight comes in shape [in_dim, out_dim]
+    # output comes in empty with shape [leading_dims, out_dim]
+    src_shape = list(src.shape)
+    weight_shape = weight.shape
+    assert len(weight_shape) == 2
+    assert src_shape[-1] == weight_shape[-1]
+    src_shape[-1] = weight_shape[0]
+    return src.new_empty(src_shape, dtype=src.dtype)
+
+
+@register_fake("cadence::quantized_w8a32_conv")
+def quantized_w8a32_conv_meta(
+    src: torch.Tensor,
+    weight: torch.Tensor,
+    w_scale: float,
+    bias: torch.Tensor,
+    b_scale: float,
+) -> torch.Tensor:
+    # src comes in shape [batch, in_channel, in_length]
+    # weight comes in shape [out_ch, in_ch, kernel_dim]
+    # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
+    assert len(src.shape) == 3
+
+    kernel_size, out_channels, in_channels = weight.shape
+    assert in_channels == src.shape[-1]
+
+    # Compute the output tensor size
+    output_size = get_conv1d_output_size(
+        src.permute(0, 2, 1).shape,
+        out_channels,
+        stride=1,
+        padding=0,
+        dilation=1,
+        kernel_size=kernel_size,
+        channel_last=False,
+    )
+    return src.new_empty(output_size, dtype=src.dtype)
+
+
+@register_fake("cadence::quantized_w8a32_gru")
+def quantized_w8a32_gru_meta(
+    inputs: torch.Tensor,
+    hidden: torch.Tensor,
+    weights_inputs: torch.Tensor,
+    w_i_scale: float,
+    weights_hidden: torch.Tensor,
+    w_h_scale: float,
+    bias_inputs: torch.Tensor,
+    b_i_scale: float,
+    bias_hidden: torch.Tensor,
+    b_h_scale: float,
+) -> torch.Tensor:
+    return inputs.new_empty((2, hidden.shape[-1]), dtype=inputs.dtype)
diff --git a/backends/cadence/aot/program_builder.py b/backends/cadence/aot/program_builder.py
index 862ba4e977c..46d730b68ff 100644
--- a/backends/cadence/aot/program_builder.py
+++ b/backends/cadence/aot/program_builder.py
@@ -12,6 +12,7 @@
 from torch import Tensor
 from torch._export.verifier import Verifier
 from torch._ops import OpOverload
+from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.export import ExportedProgram
 from torch.export.exported_program import ModuleCallEntry, ModuleCallSignature
 from torch.export.graph_signature import (
@@ -37,6 +38,7 @@ def __init__(
         self,
         mode: Optional[IrMode] = None,
         _core_aten_ops_exception_list: Optional[list[OpOverload]] = None,
+        fake_tensor_mode: Optional[FakeTensorMode] = None,
     ) -> None:
         self.input_specs: list[InputSpec] = []
         self.output_specs: list[OutputSpec] = []
@@ -46,7 +48,7 @@ def __init__(
         self._core_aten_ops_exception_list: list[OpOverload] = (
             _core_aten_ops_exception_list or []
         )
-        super().__init__()
+        super().__init__(fake_tensor_mode=fake_tensor_mode)
 
     def insert_input_spec(
         self, target: str, input_kind: InputKind, value: Tensor
diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py
index 0461c03ccb7..2fa0f794e3c 100644
--- a/backends/cadence/aot/quantizer/fusion_pass.py
+++ b/backends/cadence/aot/quantizer/fusion_pass.py
@@ -24,6 +24,9 @@
     LayerNormPattern,
     LinearPattern,
     MatmulPattern,
+    MixedW8A32ConvPattern,
+    MixedW8A32GruPattern,
+    MixedW8A32LinearPattern,
     ReluPattern0,
     ReluPattern1,
     SoftmaxPattern,
@@ -390,6 +393,29 @@ def get_args_and_kwargs_relu(
     return args, kwargs
 
 
+def get_args_and_kwargs_mixed_w8a32_linear(
+    graph_module: GraphModule,
+    other_inputs: List[fx.Node],
+    weights_inputs: List[fx.Node],
+    dequants_weights: List[fx.Node],
+    bias_inputs: List[fx.Node],
+    dequants_biases: List[fx.Node],
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    w_scale_ = dequants_weights[0].args[1]
+    b_scale_ = dequants_biases[0].args[1]
+
+    args = (
+        other_inputs[0],
+        weights_inputs[0],
+        w_scale_,
+        bias_inputs[0],
+        b_scale_,
+    )
+    kwargs = {}
+
+    return args, kwargs
+
+
 def get_args_and_kwargs_softmax(
     graph_module: GraphModule,
     inputs_inputs: List[fx.Node],
@@ -454,6 +480,87 @@ def get_args_and_kwargs_softmax(
         out_zero_point_tensor,
     )
     kwargs = {}
+
+    return args, kwargs
+
+
+def get_args_and_kwargs_mixed_w8a32_conv(
+    graph_module: GraphModule,
+    other_inputs: List[fx.Node],
+    weights_inputs: List[fx.Node],
+    dequants_weights: List[fx.Node],
+    bias_inputs: List[fx.Node],
+    dequants_biases: List[fx.Node],
+    op_node: fx.Node,
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    # Stride, padding, dilation, groups not supported yet
+    if len(op_node.args) > 3:
+        assert op_node.args[3] == [1]  # Stride
+    if len(op_node.args) > 4:
+        assert op_node.args[4] == [0]  # Padding
+    if len(op_node.args) > 5:
+        assert op_node.args[5] == [1]  # Dilation
+    if len(op_node.args) > 6:
+        assert op_node.args[6] == 1  # Groups
+
+    assert len(dequants_weights) == 1
+    assert len(dequants_biases) == 1
+    W_scale_ = dequants_weights[0].args[1]
+    B_scale_ = dequants_biases[0].args[1]
+
+    transposed_inputs = graph_module.graph.call_function(
+        torch.ops.aten.permute.default,
+        (other_inputs[0], [0, 2, 1]),  # NCL -> NLC
+    )
+    transposed_weights = graph_module.graph.call_function(
+        torch.ops.aten.permute.default,
+        (weights_inputs[0], [2, 0, 1]),  # NCL -> NLC
+    )
+
+    args = (
+        transposed_inputs,
+        transposed_weights,
+        W_scale_,
+        bias_inputs[0],
+        B_scale_,
+    )
+    kwargs = {}
+
+    return args, kwargs
+
+
+def get_args_and_kwargs_mixed_w8a32_gru(
+    graph_module: GraphModule,
+    other_inputs: List[fx.Node],
+    weights_inputs: List[fx.Node],
+    dequants_weights: List[fx.Node],
+    bias_inputs: List[fx.Node],
+    dequants_biases: List[fx.Node],
+    op_node: fx.Node,
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    # Stride, padding, dilation, groups not supported yet
+
+    assert len(dequants_weights) == 2
+    assert len(dequants_biases) == 2
+    w_i_scale = dequants_weights[0].args[1]
+    w_h_scale = dequants_weights[1].args[1]
+    b_i_scale = dequants_biases[0].args[1]
+    b_h_scale = dequants_biases[1].args[1]
+
+    args = (
+        other_inputs[0],
+        other_inputs[1],
+        weights_inputs[0],
+        w_i_scale,
+        weights_inputs[1],
+        w_h_scale,
+        bias_inputs[0],
+        b_i_scale,
+        bias_inputs[1],
+        b_h_scale,
+    )
+    kwargs = {}
+
     return args, kwargs
 
 
@@ -617,6 +724,35 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                             quant_node,
                             op_node,
                         )
+                    elif isinstance(pattern, MixedW8A32LinearPattern):
+                        args, kwargs = get_args_and_kwargs_mixed_w8a32_linear(
+                            graph_module,
+                            other_inputs,
+                            weights_inputs,
+                            dequants_weights,
+                            bias_inputs,
+                            dequants_biases,
+                        )
+                    elif isinstance(pattern, MixedW8A32ConvPattern):
+                        args, kwargs = get_args_and_kwargs_mixed_w8a32_conv(
+                            graph_module,
+                            other_inputs,
+                            weights_inputs,
+                            dequants_weights,
+                            bias_inputs,
+                            dequants_biases,
+                            op_node,
+                        )
+                    elif isinstance(pattern, MixedW8A32GruPattern):
+                        args, kwargs = get_args_and_kwargs_mixed_w8a32_gru(
+                            graph_module,
+                            other_inputs,
+                            weights_inputs,
+                            dequants_weights,
+                            bias_inputs,
+                            dequants_biases,
+                            op_node,
+                        )
 
                     fused = graph_module.graph.call_function(
                         pattern.replacement_op(),
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py
index 4eae55502d7..2452cfdcfea 100644
--- a/backends/cadence/aot/quantizer/patterns.py
+++ b/backends/cadence/aot/quantizer/patterns.py
@@ -524,7 +524,6 @@ def partition_types(self) -> List[OpOverload]:
 
 
 class SoftmaxPattern(QuantizationPattern):
-
     def partition_types(self) -> List[OpOverload]:
         return [torch.ops.aten._softmax.default]
 
@@ -546,3 +545,176 @@ def get_anchors(
 
     def replacement_op(self) -> OpOverload:
         return torch.ops.cadence.quantized_softmax.default
+
+
+class MixedW8A32LinearPattern(QuantizationPattern):
+    def partition_types(self) -> List[OpOverload]:
+        return [torch.ops.aten.linear.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> Tuple[PartitionAnchors, fx.Node]:
+        # pyre-ignore[29]
+        linear_layer = fused_partition[0].nodes[-1]
+
+        # Bail if the arguments have different shapes than expected
+        if len(linear_layer.args) != 3 or len(linear_layer.kwargs) > 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                linear_layer,
+            )
+
+        input_node = linear_layer.args[0]
+        input_shape = input_node.meta["tensor_meta"].shape
+
+        # Bail if the weights are not multiple of 4 (SIMD)
+        if input_shape[-1] % 4 != 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                linear_layer,
+            )
+        # Currenly only supporting vector-matrix multiplication
+        if len(input_shape) > 0 and input_shape[-2] != 1:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                linear_layer,
+            )
+
+        return (
+            PartitionAnchors(
+                inputs=[],
+                weights=[(linear_layer, 1)],
+                biases=[(linear_layer, 2)],
+                output=[],
+                others=[(linear_layer, 0)],
+            ),
+            linear_layer,
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_w8a32_linear.default
+
+
+class MixedW8A32ConvPattern(QuantizationPattern):
+    def partition_types(self) -> List[OpOverload]:
+        return [torch.ops.aten.conv1d.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> Tuple[PartitionAnchors, fx.Node]:
+        # pyre-ignore[29]
+        conv_layer = fused_partition[0].nodes[-1]
+
+        # Bail if the arguments have different shapes than expected
+        # Stride, padding, dilation and groups are not supported
+        if len(conv_layer.args) != 3 or len(conv_layer.kwargs) > 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                conv_layer,
+            )
+
+        cnn_weights = conv_layer.args[1]
+        if hasattr(cnn_weights.meta, "tensor_meta"):
+            cnn_weights_shape = cnn_weights.meta["tensor_meta"].shape
+            # Bail if the channels are not multiple of 4 (SIMD)
+            if cnn_weights_shape[0] % 4 != 0:
+                return (
+                    PartitionAnchors(
+                        empty=True,
+                    ),
+                    conv_layer,
+                )
+            if cnn_weights_shape[1] % 4 != 0:
+                return (
+                    PartitionAnchors(
+                        empty=True,
+                    ),
+                    conv_layer,
+                )
+            # Bail if the kernel size is not 3
+            if cnn_weights_shape[2] != 3:
+                return (
+                    PartitionAnchors(
+                        empty=True,
+                    ),
+                    conv_layer,
+                )
+
+        return (
+            PartitionAnchors(
+                inputs=[],
+                weights=[(conv_layer, 1)],
+                biases=[(conv_layer, 2)],
+                output=[],
+                others=[(conv_layer, 0)],
+            ),
+            conv_layer,
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_w8a32_conv.default
+
+
+class MixedW8A32GruPattern(QuantizationPattern):
+    def partition_types(self) -> List[OpOverload]:
+        return [torch.ops.aten.gru.input]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> Tuple[PartitionAnchors, fx.Node]:
+        # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
+        gru_layer = fused_partition[0].nodes[-1]
+        if len(gru_layer.kwargs) > 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                gru_layer,
+            )
+
+        # Bail if input or states are not multiple of 4 (SIMD)
+        if gru_layer.args[0].meta["tensor_meta"].shape[-1] % 4 != 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                gru_layer,
+            )
+        if gru_layer.args[1].meta["tensor_meta"].shape[-1] % 4 != 0:
+            return (
+                PartitionAnchors(
+                    empty=True,
+                ),
+                gru_layer,
+            )
+
+        class Wrapper:  # noqa: B903
+            def __init__(self, args, meta):
+                self.args = args
+                self.meta = meta
+
+        wrapper = Wrapper(tuple(gru_layer.args[2]), gru_layer.meta)
+
+        return (
+            PartitionAnchors(
+                inputs=[],
+                # pyre-fixme[6]: Expected `List[Tuple[Node, int]]` but got `List[Tuple[Wrapper, int]]`.
+                weights=[(wrapper, 0), (wrapper, 1)],
+                # pyre-fixme[6]: Expected `List[Union[Tuple[Node, int], Tuple[Node, int, DerivedQuantizationSpec]]]` but got `List[Tuple[Wrapper, int]]`.
+                biases=[(wrapper, 2), (wrapper, 3)],
+                output=[],
+                others=[(gru_layer, 0), (gru_layer, 1)],
+            ),
+            gru_layer,
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_w8a32_gru.default
diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
index 536b28f5cec..d4af074c475 100644
--- a/backends/cadence/aot/quantizer/quantizer.py
+++ b/backends/cadence/aot/quantizer/quantizer.py
@@ -24,6 +24,9 @@
     LayerNormPattern,
     LinearPattern,
     MatmulPattern,
+    MixedW8A32ConvPattern,
+    MixedW8A32GruPattern,
+    MixedW8A32LinearPattern,
     QuantizationPattern,
     ReluPattern0,
     ReluPattern1,
@@ -109,6 +112,13 @@
     None,
 )
 
+qconfig_A32W8sym = QuantizationConfig(
+    input_activation=None,
+    output_activation=None,
+    weight=wgt_qspec_sym8s,
+    bias=wgt_qspec_sym8s,
+)
+
 
 class CadenceAtenQuantizer(Quantizer):
     def __init__(
@@ -302,6 +312,26 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         super().__init__(quantizers)
 
 
+class CadenceW8A32MixedQuantizer(CadenceQuantizer):
+    """
+    Quantizer for mixed quantization, 8 bit weights and 32 bit activations
+    TODO: Experimental quantizer, not yet well supported in OSS
+    """
+
+    def __init__(self) -> None:
+        quantizers = []
+        quantizers.append(
+            CadenceAtenQuantizer(MixedW8A32LinearPattern(), qconfig_A32W8sym)
+        )
+        quantizers.append(
+            CadenceAtenQuantizer(MixedW8A32ConvPattern(), qconfig_A32W8sym)
+        )
+        quantizers.append(
+            CadenceAtenQuantizer(MixedW8A32GruPattern(), qconfig_A32W8sym)
+        )
+        super().__init__(quantizers)
+
+
 class CadenceWithSoftmaxQuantizer(CadenceQuantizer):
     """
     Quantizer including A16 softmax
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
index 312bed89315..ed9bb438a9e 100644
--- a/backends/cadence/aot/ref_implementations.py
+++ b/backends/cadence/aot/ref_implementations.py
@@ -62,7 +62,7 @@ def quantize_per_tensor(
     ]
     if dtype not in supported_quant_types:
         raise ValueError(
-            f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_quant_types}"
+            f"Unsupported dtype to quantize to {dtype}. Supported dtypes must be one of {supported_quant_types}"
         )
 
     return torch.ops.quantized_decomposed.quantize_per_tensor(
@@ -264,7 +264,7 @@ def quantized_linear_common(
     supported_dtypes = [torch.int8, torch.uint8, torch.int32]
     if dtype not in supported_dtypes:
         raise ValueError(
-            f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_dtypes}"
+            f"Unsupported dtype to quantize to {dtype}. Supported dtypes must be one of {supported_dtypes}"
         )
 
     out = torch.nn.functional.linear(
@@ -330,8 +330,8 @@ def variant(
                 if out_shift.numel() != 1:
                     raise ValueError("out_shift must be a scalar")
 
-                if out_shift.dtype != torch.int64:
-                    raise ValueError("out_shift must be an int64")
+                if out_shift.dtype != torch.int32:
+                    raise ValueError("out_shift must be an int32")
 
                 _out_shift = int(out_shift.item())
                 _out_multiplier = int(out_multiplier[0].item())
@@ -427,25 +427,27 @@ def quantized_matmul(
         - out_multiplier (int): The multiplier used to scale the output
         - out_shift (int): The shift used to scale the output
         - out_zero_point (int): The quantized mapping of zero for the output
-        - transposed (bool): Whether to transpose the weight tensor
+        - transposed (bool): Whether Y is transposed.
     """
     if bias is not None and not torch.all(bias == 0):
         raise ValueError("bias must be None or all zeros since unused in out variant")
 
-    # Looks weird, but quantized linear assumes weights are pre-transposed,
-    # hence we transpose only if `transposed` is False.
-    if not transposed:
-        Y = Y.T
+    if transposed:
+        Y = Y.transpose(-1, -2)
 
-    return quantized_linear_common(
-        X,
-        Y,
-        bias or torch.zeros(1, dtype=torch.int32),
-        X_zero_point,
-        Y_zero_point,
-        out_multiplier,
-        out_shift,
+    out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift))
+
+    out = torch.matmul(
+        (X - X_zero_point).float(),
+        (Y - Y_zero_point).float(),
+    )
+    return quantize_per_tensor(
+        out,
+        out_scale,
         out_zero_point,
+        torch.iinfo(X.dtype).min,
+        torch.iinfo(X.dtype).max,
+        X.dtype,
     )
 
 
@@ -960,6 +962,7 @@ def convolution(
     _stride: tuple[int, int] | int = stride
     _padding: tuple[int, int] | int = padding
     _dilation: tuple[int, int] | int = dilation
+
     if conv_is_1d:
         conv = torch.nn.functional.conv1d
         _stride = stride[0]
@@ -978,6 +981,64 @@ def convolution(
     return conv_out
 
 
+@impl(m, "transposed_convolution")
+def transposed_convolution(
+    input_tensor: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: tuple[int, int],
+    padding: tuple[int, int],
+    dilation: tuple[int, int],
+    output_padding: tuple[int, int],
+    groups: int,
+    channel_last: bool = False,
+) -> torch.Tensor:
+
+    conv_is_1d = len(input_tensor.shape) == 3
+    if channel_last:
+        if conv_is_1d:
+            input_tensor = input_tensor.movedim(-1, 1).contiguous()
+            if len(weight.shape) != 3:
+                raise ValueError("Weight tensor must be 3D if input is 3D")
+            weight = weight.movedim(-1, 1).contiguous()
+        else:
+            input_tensor = input_tensor.movedim(-1, -3)
+            if len(weight.shape) != 4:
+                raise ValueError("Weight tensor must be 4D if input is nd > 3")
+            weight = torch.permute(weight, (0, -1, 1, 2)).contiguous()
+
+    _stride: tuple[int, int] | int = stride
+    _padding: tuple[int, int] | int = padding
+    _dilation: tuple[int, int] | int = dilation
+    _output_padding: tuple[int, int] | int = output_padding
+    if conv_is_1d:
+        conv = torch.nn.functional.conv_transpose1d
+        _stride = stride[0]
+        _padding = padding[0]
+        _dilation = dilation[0]
+        _output_padding = output_padding[0]
+    else:
+        conv = torch.nn.functional.conv_transpose2d
+
+    conv_out = conv(
+        input_tensor,
+        weight,
+        bias,
+        _stride,
+        _padding,
+        _output_padding,
+        groups,
+        _dilation,
+    )
+    if channel_last:
+        if conv_is_1d:
+            conv_out = conv_out.movedim(1, -1).contiguous()
+        else:
+            conv_out = conv_out.movedim(-3, -1).contiguous()
+
+    return conv_out
+
+
 @impl(m, "avg_pool2d")
 def avg_pool2d(
     input_tensor: torch.Tensor,
@@ -1066,7 +1127,6 @@ def quantized_relu_common(
 
 
 def quantized_relu_variant(
-    per_tensor: bool,
     dtype: torch.dtype | None = None,
 ) -> Callable[[Callable[..., torch.Tensor]], Callable[..., torch.Tensor]]:
     """Create a quantized relu variant with type checking."""
@@ -1074,43 +1134,20 @@ def quantized_relu_variant(
     def decorator(_: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
         def variant(
             X: torch.Tensor,
-            X_zero_point: torch.Tensor | int,
+            X_zero_point: int,
             out_zero_point: int,
-            out_multiplier: torch.Tensor | int,
-            out_shift: torch.Tensor | int,
+            out_multiplier: int,
+            out_shift: int,
         ) -> torch.Tensor:
-            if per_tensor:
-                if dtype and X.dtype != dtype:
-                    raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}")
-
-                assert isinstance(out_shift, int)
-                assert isinstance(out_multiplier, int)
-                _out_shift = out_shift
-                _out_multiplier = out_multiplier
-            else:
-                assert isinstance(out_multiplier, torch.Tensor)
-                if out_multiplier.numel() > 1:
-                    raise ValueError("Only scalar out_multiplier is supported")
-
-                assert isinstance(out_shift, torch.Tensor)
-                if out_shift.numel() > 1:
-                    raise ValueError("Only scalar out_shift is supported")
-
-                assert isinstance(X_zero_point, torch.Tensor)
-                if X_zero_point.shape != X.shape:
-                    raise ValueError(
-                        f"X_zero_point shape must be {X.shape}. Got {X_zero_point.shape}"
-                    )
-
-                _out_multiplier = int(out_multiplier.item())
-                _out_shift = int(out_shift.item())
+            if dtype and X.dtype != dtype:
+                raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}")
 
             return quantized_relu_common(
                 X,
                 X_zero_point,
                 out_zero_point,
-                _out_multiplier,
-                _out_shift,
+                out_multiplier,
+                out_shift,
             )
 
         return variant
@@ -1118,23 +1155,18 @@ def variant(
     return decorator
 
 
-@impl(m, "quantized_relu")
-@quantized_relu_variant(False)
-def quantized_relu() -> torch.Tensor: ...
-
-
 @impl(m, "quantized_relu.per_tensor")
-@quantized_relu_variant(True)
+@quantized_relu_variant()
 def quantized_relu_per_tensor() -> torch.Tensor: ...
 
 
 @impl(m, "quantized_relu_asym8s_asym8s.per_tensor")
-@quantized_relu_variant(True, torch.int8)
+@quantized_relu_variant(torch.int8)
 def quantized_relu_asym8s_asym8s_per_tensor() -> torch.Tensor: ...
 
 
 @impl(m, "quantized_relu_asym8u_asym8u.per_tensor")
-@quantized_relu_variant(True, torch.uint8)
+@quantized_relu_variant(torch.uint8)
 def quantized_relu_asym8u_asym8u_per_tensor() -> torch.Tensor: ...
 
 
@@ -1244,3 +1276,303 @@ def rope(
         [x0 * cos_tensor - x1 * sin_tensor, x0 * sin_tensor + x1 * cos_tensor], dim=-1
     )
     return rotated.view(original_shape)
+
+
+@impl(m, "im2row")
+def im2row(
+    input_tensor: torch.Tensor,
+    kernel_size: tuple[int, int],
+    dilation: tuple[int, int],
+    padding: tuple[int, int],
+    stride: tuple[int, int],
+    in_zero_point: torch.Tensor,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    """
+    Converts an input tensor into a 2D matrix where each row is a flattened sliding window (patch)
+    from the input, suitable for use in convolution as a matrix multiplication (im2row).
+
+    Args:
+        - input_tensor: Input tensor of shape (N, C, H, W) or (N, H, W, C) if channel_last.
+        - kernel_size: Size of the convolution kernel.
+        - dilation: Dilation of the convolution kernel.
+        - padding: Padding to apply to the input.
+        - stride: Stride of the convolution.
+        - in_zero_point : Zero point for input quantization (broadcastable to input).
+        - channel_last: If True, input is in NHWC format, else NCHW.
+
+    Returns:
+        - Tensor of shape (N, num_patches, patch_size)
+    """
+    if len(input_tensor.shape) == 3:
+        height_dim = 1 if channel_last else 2
+        input_tensor = input_tensor.unsqueeze(height_dim)
+
+    if in_zero_point is not None:
+        if in_zero_point.numel() != 1 and in_zero_point.shape != (
+            input_tensor.shape[0],
+        ):
+            raise ValueError(
+                f"Input zero point must be a scalar or broadcastable to input shape {input_tensor.shape}"
+            )
+        if in_zero_point.dtype != torch.int32:
+            raise ValueError("Input zero point must be an int32 tensor")
+
+    if channel_last:
+        input_tensor = input_tensor.movedim(-1, -3).contiguous()  # NHWC -> NCHW
+
+    N, C, H, W = input_tensor.shape
+    kH, kW = kernel_size
+    dH, dW = dilation
+    pH, pW = padding
+    sH, sW = stride
+
+    # Handle padding with zero point values
+    if in_zero_point is not None and (pH > 0 or pW > 0):
+        # Expand zero point to (N, 1, 1, 1) for broadcasting
+        in_zero_point = in_zero_point.expand(N)
+
+        # Pad input with the per-batch zero point values
+        input_tensor = torch.stack(
+            [
+                torch.nn.functional.pad(
+                    input_tensor[i],
+                    (pW, pW, pH, pH),
+                    mode="constant",
+                    value=in_zero_point[i].item(),
+                )
+                for i in range(len(input_tensor))
+            ]
+        )
+
+        padding = (0, 0)  # Already padded manually
+
+    # Use unfold to extract sliding local blocks
+    # Unfold: (N, C, H, W) -> (N, C, L, kH, kW), where L = number of sliding windows
+    # torch.nn.functional.unfold returns (N, C*kH*kW, L)
+    patches = torch.nn.functional.unfold(
+        input_tensor.float(),  # unfold not implemented for int
+        kernel_size=(kH, kW),
+        dilation=(dH, dW),
+        padding=padding,
+        stride=(sH, sW),
+    ).to(
+        input_tensor.dtype
+    )  # (N, C*kH*kW, L)
+
+    # Transpose to (N, L, C*kH*kW)
+    patches = patches.transpose(1, 2).contiguous()
+
+    # Reshape to (N*L, C*kH*kW)
+    patches = patches.view(N, -1, C * kH * kW)
+
+    # If channel_last, output should be in NHWC patch order (but im2row is always row-major)
+    return patches
+
+
+@impl(m, "im2row.per_tensor")
+def im2row_per_tensor(
+    input_tensor: torch.Tensor,
+    kernel_size: tuple[int, int],
+    dilation: tuple[int, int],
+    padding: tuple[int, int],
+    stride: tuple[int, int],
+    in_zero_point: int,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    return im2row(
+        input_tensor,
+        kernel_size,
+        dilation,
+        padding,
+        stride,
+        torch.tensor(in_zero_point, dtype=torch.int32),
+        channel_last,
+    )
+
+
+@impl(m, "transposed_im2row")
+def transposed_im2row(
+    input_tensor: torch.Tensor,
+    kernel_size: tuple[int, int],
+    dilation: tuple[int, int],
+    padding: tuple[int, int],
+    stride: tuple[int, int],
+    output_padding: tuple[int, int],
+    in_zero_point: torch.Tensor,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    """
+    Converts input tensor patches into im2row format for transposed convolutions.
+    This function extracts patches from input in a pattern suitable for transposed convolution.
+
+    Args:
+        - input_tensor: Input spatial tensor, NCHW or NHWC format (3D or 4D).
+        - kernel_size: Size of the convolution kernel.
+        - dilation: Dilation of the convolution kernel.
+        - padding: Padding to apply to the input.
+        - stride: Stride of the convolution.
+        - output_padding: Additional output padding for transposed convolution.
+        - in_zero_point: Zero point for input quantization (broadcastable to input).
+        - channel_last: If True, input is in NHWC format, else NCHW.
+
+    Returns:
+        - 3D tensor of shape (N, output_h * output_w, kernel_h * kernel_w * in_c)
+    """
+    # Handle 1D convolution case by adding height dimension
+    if len(input_tensor.shape) == 3:
+        height_dim = 1 if channel_last else 2
+        input_tensor = input_tensor.unsqueeze(height_dim)
+
+    if in_zero_point is not None:
+        if in_zero_point.dtype != torch.int32:
+            raise ValueError("Input zero point must be an int32 tensor")
+
+    # Move to NCHW for processing if needed
+    if channel_last:
+        input_tensor = input_tensor.movedim(-1, -3).contiguous()  # NHWC -> NCHW
+
+    N, C, H_in, W_in = input_tensor.shape
+
+    # Output: (N, C*H_in*W_in, H_out, W_out)
+    H_out = (
+        (H_in - 1) * stride[0]
+        + kernel_size[0]
+        + output_padding[0]
+        - 2 * padding[0]
+        + dilation[0] * (kernel_size[0] - 1)
+    )
+    W_out = (
+        (W_in - 1) * stride[1]
+        + kernel_size[1]
+        + output_padding[1]
+        - 2 * padding[1]
+        + dilation[1] * (kernel_size[1] - 1)
+    )
+
+    # For each input pixel, create a channel where the upsampled (transposed conv) patch is placed
+    # Output: (N, C*H_in*W_in, H_out, W_out)
+    inp_flat = input_tensor.reshape(N, C * H_in * W_in)
+
+    # Calculate output spatial size
+    H_out = (
+        (H_in - 1) * stride[0]
+        - 2 * padding[0]
+        + dilation[0] * (kernel_size[0] - 1)
+        + output_padding[0]
+        + 1
+    )
+    W_out = (
+        (W_in - 1) * stride[1]
+        - 2 * padding[1]
+        + dilation[1] * (kernel_size[1] - 1)
+        + output_padding[1]
+        + 1
+    )
+
+    # Compute the upsampled (top-left) position for each input pixel
+    h_idx = torch.arange(H_in, device=input_tensor.device)
+    w_idx = torch.arange(W_in, device=input_tensor.device)
+    grid_h, grid_w = torch.meshgrid(h_idx, w_idx, indexing="ij")
+    out_h_idx = grid_h * stride[0] - padding[0]
+    out_w_idx = grid_w * stride[1] - padding[1]
+
+    # Compute all input pixel positions (flattened)
+    ch_idx = torch.arange(C * H_in * W_in, device=input_tensor.device)
+    ij_idx = ch_idx % (H_in * W_in)
+    i_idx = ij_idx // W_in
+    j_idx = ij_idx % W_in
+
+    # For each input pixel, compute the output positions for the kernel window
+    kh_idx = torch.arange(kernel_size[0], device=input_tensor.device)
+    kw_idx = torch.arange(kernel_size[1], device=input_tensor.device)
+    kh_grid, kw_grid = torch.meshgrid(kh_idx, kw_idx, indexing="ij")
+    kh_grid = kh_grid.reshape(-1)
+    kw_grid = kw_grid.reshape(-1)
+    num_kernel = kernel_size[0] * kernel_size[1]
+
+    # Broadcast to all channels and kernel positions
+    ch_idx_b = ch_idx.repeat_interleave(num_kernel)
+    n_kernel = ch_idx.shape[0] * num_kernel
+
+    i_idx_b = i_idx.repeat_interleave(num_kernel)
+    j_idx_b = j_idx.repeat_interleave(num_kernel)
+    kh_b = kh_grid.repeat(ch_idx.shape[0])
+    kw_b = kw_grid.repeat(ch_idx.shape[0])
+
+    h_out = out_h_idx[i_idx_b, j_idx_b] + kh_b * dilation[0]
+    w_out = out_w_idx[i_idx_b, j_idx_b] + kw_b * dilation[1]
+
+    # Mask for valid output positions
+    valid = (h_out >= 0) & (h_out < H_out) & (w_out >= 0) & (w_out < W_out)
+
+    # Prepare indices for advanced indexing
+    n_idx = (
+        torch.arange(N, device=input_tensor.device)
+        .view(-1, 1)
+        .expand(N, n_kernel)
+        .reshape(-1)
+    )
+    ch_idx_full = ch_idx_b.expand(N, n_kernel).reshape(-1)
+    h_out_full = h_out.expand(N, n_kernel).reshape(-1)
+    w_out_full = w_out.expand(N, n_kernel).reshape(-1)
+    valid_full = valid.expand(N, n_kernel).reshape(-1)
+
+    # Gather input values for each channel
+    inp_vals = inp_flat[:, ch_idx_b].reshape(-1)
+
+    # Create output tensor
+    patches = torch.zeros((N, C * H_in * W_in, H_out, W_out), dtype=input_tensor.dtype)
+
+    # If in_zero_point is provided, fill patches with it
+    if in_zero_point is not None:
+        if in_zero_point.numel() == 1:
+            patches.fill_(in_zero_point.item())
+        else:
+            # Broadcast in_zero_point to (N, C, H_in, W_in)
+            assert in_zero_point.shape == (N,)
+            in_zero_point = in_zero_point.view(N, 1, 1, 1)
+            patches = patches + in_zero_point
+
+    # Scatter input values to output positions (only valid positions)
+    patches[
+        n_idx[valid_full],
+        ch_idx_full[valid_full],
+        h_out_full[valid_full],
+        w_out_full[valid_full],
+    ] = inp_vals[valid_full]
+
+    # Optionally, flatten to (N, num_patches, patch_size) if needed
+    patches = patches.view(N, C * H_in * W_in, -1).transpose(1, 2).contiguous()
+    return patches
+
+
+@impl(m, "quantized_embedding_byte")
+def quantized_embedding_byte(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: torch.Tensor | None,
+    indices: torch.Tensor,
+    pruned_weights: bool = False,
+) -> torch.Tensor:
+    if pruned_weights:
+        raise NotImplementedError("Pruned weights not supported")
+
+    # Cannot use torch.ops.quantized_decomposed.embedding_byte.dtype because
+    # it doesn't support num_groups == 1
+    num_groups = 1
+    if len(weight_scales.shape) == 2:
+        num_groups = weight_scales.shape[1]
+
+    group_size = weight.shape[1] // num_groups
+    weight = torch.ops.torchao.dequantize_affine.default(
+        input=weight,
+        block_size=(1, group_size),
+        scale=weight_scales,
+        zero_point=weight_zero_points,
+        input_dtype=weight.dtype,
+        quant_min=torch.iinfo(weight.dtype).min,
+        quant_max=torch.iinfo(weight.dtype).max,
+    )
+
+    return weight[indices]
diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py
index 663c5825e52..263d3a521f3 100644
--- a/backends/cadence/aot/remove_ops.py
+++ b/backends/cadence/aot/remove_ops.py
@@ -9,7 +9,7 @@
 
 import logging
 from dataclasses import dataclass, field
-from typing import cast, List, Optional, Sequence, Set
+from typing import cast, List, Optional, Sequence, Set, Type
 
 import torch
 import torch.fx
@@ -926,19 +926,28 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         return super().call(graph_module)
 
 
+class CommonRemovePasses:
+    passes: List[Type[ExportPass]] = [
+        RemoveCloneOpPass,
+        RemoveAliasCopyOpPass,
+        RemoveNopExpandOpPass,
+        RemoveNopSliceOrViewOpPass,
+        RemoveNopSelectOpPass,
+        RemoveToOpsPass,
+        RemoveZeroSizedCatArgsPass,
+        RemovePermutesAroundElementwiseOps,
+        RemoveSqueezeViewBeforeElementwiseOps,
+        RemoveCatFromSliceCopyPass,
+    ]
+
+
 class CadenceRemoveNops:
-    passes = [
+    passes: List[Type[ExportPass]] = CommonRemovePasses.passes + [
         SimplifySliceOpPass,
         RemoveCloneOpsTransformImported,
-        RemoveToOpsPass,
         RemoveNopRequantizeOpPass,
-        RemoveZeroSizedCatArgsPass,
-        RemoveNopSliceOrViewOpPass,
-        RemoveNopExpandOpPass,
         RemoveZeroSizedConstantPadNd,
-        RemoveCloneOpPass,
         RemoveContiguousOpPass,
-        RemoveAliasCopyOpPass,
         RemoveNopMulOpPass,
         RemoveNopAddOpPass,
         RemoveNopLinalgVectorNormOpPass,
diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py
index 8de0af7311d..3cfc059e75b 100644
--- a/backends/cadence/aot/replace_ops.py
+++ b/backends/cadence/aot/replace_ops.py
@@ -43,7 +43,6 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
-from torch._subclasses import FakeTensor
 from torch.fx.node import Argument
 
 # A map to represent ops that:
@@ -90,14 +89,10 @@ def replace_logical_nop_where_with_where(
 
             # Get the third arg node and its input
             logical_not_node = node.args[0]
-            logical_not_input_tensor = (
-                logical_not_node.args[0].to_tensor()
-                if isinstance(logical_not_node.args[0], ProxyValue)
-                else logical_not_node.args[0]
-            )
+            logical_not_input_node = logical_not_node.args[0]
 
             # If the logical_not input is not a boolean tensor, bail.
-            if logical_not_input_tensor.meta["spec"].dtype != torch.bool:
+            if logical_not_input_node.meta["val"].dtype != torch.bool:
                 continue
 
             # Replace the where op with another one, flipping the inputs and using the boolean
@@ -263,7 +258,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Glean the shape of input and output tensor
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         in_shape = in_tensor.shape
         out_shape = meta["val"].shape
         # Get the select dimension
@@ -295,7 +290,7 @@ def call_operator(self, op, args, kwargs, meta):
 
         # Create a zero bias tensor, and insert it as a graph buffer before the
         # current node
-        mat2_tensor = mat2.to_tensor() if isinstance(mat2, ProxyValue) else mat2
+        mat2_tensor = mat2.to_tensor()
         bias_size = mat2_tensor.size(1)
         zero_bias = super().call_operator(
             exir_ops.edge.aten.full.default,
@@ -410,7 +405,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Get the old dim and new dim order
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         old_dims = tuple(range(in_tensor.dim()))
         new_dims = args[1]
 
@@ -488,11 +483,7 @@ def call_operator(self, op, args, kwargs, meta):
         repeats = args[1]
 
         # Glean the shapes of input tensor
-        in_shape = list(
-            in_tensor.to_tensor().shape
-            if isinstance(in_tensor, ProxyValue)
-            else in_tensor.shape
-        )
+        in_shape = list(in_tensor.to_tensor().shape)
 
         # If the size of repeats is more than the dimensionality of the tensor,
         # the output of repeat will be a higher-dimensional tensor. We reshape
@@ -793,15 +784,9 @@ def call_operator(self, op, args, kwargs, meta):
         (in_tensor, weight, bias, stride, padding, dilation, groups) = args[0:7]
 
         # Glean the shapes of input, weight, and output
-        in_shape = (
-            in_tensor.to_tensor().shape
-            if isinstance(in_tensor, ProxyValue)
-            else in_tensor.shape
-        )
+        in_shape = in_tensor.to_tensor().shape
 
-        weight_shape = (
-            weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
-        )
+        weight_shape = weight.to_tensor().shape
         out_shape = meta["val"].shape
         assert None not in {in_shape, weight_shape, out_shape}
 
@@ -823,26 +808,16 @@ def call_operator(self, op, args, kwargs, meta):
         # Reshape the weight to [out_channels, in_channels * X]
         K = math.prod(weight_shape[1:])
 
-        # If weight is a ProxyValue, linear_weight needs to be the output of a
-        # graph operation (in this case a view_copy op) to be an explicit ProxyValue
-        # as well. If not, the view op can be done directly on the tensor.
-        linear_weight = (
-            super().call_operator(
-                exir_ops.edge.aten.view_copy.default,
-                (
-                    weight,
-                    [weight_shape[0], K],
-                ),
-                kwargs,
-                meta,
-            )
-            if isinstance(weight, ProxyValue)
-            else weight.contiguous().view(weight_shape[0], K)
+        # Weight is always a ProxyValue, so we need a view_copy operation
+        linear_weight = super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (
+                weight,
+                [weight_shape[0], K],
+            ),
+            kwargs,
+            meta,
         )
-        # From the previous check, if linear_weight is a FakeTensor, it has to be
-        # a constant (if not, it would be a ProxyValue). Mark it as such.
-        if isinstance(linear_weight, FakeTensor):
-            linear_weight.constant = linear_weight
 
         # Reshape the input from 3d to 2d tensor
         in_view = super().call_operator(
@@ -865,11 +840,7 @@ def call_operator(self, op, args, kwargs, meta):
                 out_zero_point,
             ) = args[7:12]
             # If the multiplier and shift tensors are provided, use them.
-            if (
-                len(args) >= 14
-                and isinstance(args[12], ProxyValue)
-                and isinstance(args[13], ProxyValue)
-            ):
+            if len(args) >= 14:
                 out_multiplier = args[12]
                 out_shift = args[13]
             # If not, compute them.
@@ -1073,9 +1044,7 @@ def call_operator(self, op, args, kwargs, meta):
         if groups != 1:
             return super().call_operator(op, args, kwargs, meta)
 
-        weight_shape = (
-            weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
-        )
+        weight_shape = weight.to_tensor().shape
         # If this is a pointwise convolution, im2col will start dominating the
         # runtime. So we call convolution op for this case.
         if (
@@ -1114,8 +1083,6 @@ def call_operator(self, op, args, kwargs, meta):
                     {"dtype": torch.int32},
                     meta,
                 )
-                if isinstance(in_tensor.to_tensor(), FakeTensor)
-                else get_zero_point(in_tensor.to_tensor())
             )
             if quantized_op
             else torch.tensor(0, dtype=torch.int32)
@@ -1151,26 +1118,16 @@ def call_operator(self, op, args, kwargs, meta):
         # Get the product of the >2 dims of the weight
         K = math.prod(weight_shape[1:])
 
-        # If weight is a ProxyValue, linear_weight needs to be the output of a
-        # graph operation (in this case a view_copy op) to be an explicit ProxyValue
-        # as well. If not, the view op can be done directly on the tensor.
-        linear_weight = (
-            super().call_operator(
-                exir_ops.edge.aten.view_copy.default,
-                (
-                    weight,
-                    [weight_shape[0], K],
-                ),
-                kwargs,
-                meta,
-            )
-            if isinstance(weight, ProxyValue)
-            else weight.contiguous().view(weight_shape[0], K)
+        # Weight is always a ProxyValue, so we need a view_copy operation
+        linear_weight = super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (
+                weight,
+                [weight_shape[0], K],
+            ),
+            kwargs,
+            meta,
         )
-        # From the previous check, if linear_weight is a FakeTensor, it has to be
-        # a constant (if not, it would be a ProxyValue). Mark it as such.
-        if isinstance(linear_weight, FakeTensor):
-            linear_weight.constant = linear_weight
 
         # Create the linear node, which multiplies the 3d input with 2d weight
         # tensors with bias addition. The outermost dimension of the input is
@@ -1184,11 +1141,7 @@ def call_operator(self, op, args, kwargs, meta):
                 out_zero_point,
             ) = args[7:12]
             # If the multiplier and shift tensors are provided, use them.
-            if (
-                len(args) >= 14
-                and isinstance(args[12], ProxyValue)
-                and isinstance(args[13], ProxyValue)
-            ):
+            if len(args) >= 14:
                 out_multiplier = args[12]
                 out_shift = args[13]
             # If not, compute them.
@@ -1276,9 +1229,7 @@ def call_operator(self, op, args, kwargs, meta):
 
         # Get the shapes
         out_shape = meta["val"].shape
-        weight_shape = (
-            weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape
-        )
+        weight_shape = weight.to_tensor().shape
         assert None not in {weight_shape, out_shape}
 
         # Determine if the transposed_convolution is NCHW or NHWC. The NHWC,
@@ -1332,26 +1283,16 @@ def call_operator(self, op, args, kwargs, meta):
         # Reshape the weight to [out_channels, in_channels * X]
         K = math.prod(weight_shape[1:])
 
-        # If weight is a ProxyValue, linear_weight needs to be the output of a
-        # graph operation (in this case a view_copy op) to be an explicit ProxyValue
-        # as well. If not, the view op can be done directly on the tensor.
-        linear_weight = (
-            super().call_operator(
-                exir_ops.edge.aten.view_copy.default,
-                (
-                    weight,
-                    [weight_shape[0], K],
-                ),
-                kwargs,
-                meta,
-            )
-            if isinstance(weight, ProxyValue)
-            else weight.contiguous().view(weight_shape[0], K)
+        # Weight is always a ProxyValue, so we need a view_copy operation
+        linear_weight = super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (
+                weight,
+                [weight_shape[0], K],
+            ),
+            kwargs,
+            meta,
         )
-        # From the previous check, if linear_weight is a FakeTensor, it has to be
-        # a constant (if not, it would be a ProxyValue). Mark it as such.
-        if isinstance(linear_weight, FakeTensor):
-            linear_weight.constant = linear_weight
 
         # Create the linear node, which multiplies the 3d input with 2d weight
         # tensors with bias addition. The outermost dimension of the input is
@@ -1422,7 +1363,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Get the input tensor and shape
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         in_shape = in_tensor.shape
         # Get the output tensor shape
         out_shape = meta["val"].shape
@@ -1491,7 +1432,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Extract the input tensor
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         leading_dims = math.prod(in_tensor.shape[:-1])
         # If the tensor is not a vector, do nothing.
         if leading_dims != 1:
@@ -1557,11 +1498,7 @@ def call_operator(self, op, args, kwargs, meta):
         return super().call_operator(
             exir_ops.edge.aten.full.default,
             (
-                (
-                    args[0].to_tensor().shape
-                    if isinstance(args[0], ProxyValue)
-                    else args[0].shape
-                ),
+                args[0].to_tensor().shape,
                 args[1],
             ),
             {},
@@ -1602,60 +1539,58 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
     replaced_scalar_args: dict[
         EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]]
     ] = {
-        exir_ops.edge.cadence.quantized_add: (
+        exir_ops.edge.cadence.quantized_add.default: (
             exir_ops.edge.cadence.quantized_add.per_tensor,
             [1, 2, 4, 5],
         ),
-        exir_ops.edge.cadence.quantized_conv2d_nchw: (
+        exir_ops.edge.cadence.quantized_conv2d_nchw.default: (
             exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor,
             [8, 9, 12, 13],
         ),
-        exir_ops.edge.cadence.quantized_conv2d_nhwc: (
+        exir_ops.edge.cadence.quantized_conv2d_nhwc.default: (
             exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor,
             [8, 9, 12, 13],
         ),
-        exir_ops.edge.cadence.quantized_fully_connected: (
+        exir_ops.edge.cadence.quantized_fully_connected.default: (
             exir_ops.edge.cadence.quantized_fully_connected.per_tensor,
             [4, 5, 6],
         ),
-        exir_ops.edge.cadence.quantized_layer_norm: (
+        exir_ops.edge.cadence.quantized_layer_norm.default: (
             exir_ops.edge.cadence.quantized_layer_norm.per_tensor,
             [1, 2],
         ),
-        exir_ops.edge.cadence.quantized_linear: (
+        exir_ops.edge.cadence.quantized_linear.default: (
             exir_ops.edge.cadence.quantized_linear.per_tensor,
             [4, 5, 6],
         ),
-        exir_ops.edge.cadence.quantized_relu: (
+        exir_ops.edge.cadence.quantized_relu.default: (
             exir_ops.edge.cadence.quantized_relu.per_tensor,
             [1, 3, 4],
         ),
-        exir_ops.edge.cadence.im2row: (
+        exir_ops.edge.cadence.im2row.default: (
             exir_ops.edge.cadence.im2row.per_tensor,
             [5],
         ),
-        exir_ops.edge.cadence.requantize: (
+        exir_ops.edge.cadence.requantize.default: (
             exir_ops.edge.cadence.requantize.per_tensor,
             [1, 2, 3, 4],
         ),
     }
 
     def call_operator(self, op, args, kwargs, meta):
-        op_edge_overload_packet = get_edge_overload_packet(op)
-
-        if op_edge_overload_packet not in self.replaced_scalar_args:
+        if op not in self.replaced_scalar_args:
             return super().call_operator(op, args, kwargs, meta)
 
         # Get all the args that need to be replaced.
-        new_op, args_to_be_replaced = self.replaced_scalar_args[op_edge_overload_packet]
+        new_op, args_to_be_replaced = self.replaced_scalar_args[op]
+
+        if op == new_op:
+            return super().call_operator(op, args, kwargs, meta)
 
         updated_args = list(args)
         for op_arg_index in args_to_be_replaced:
             arg = args[op_arg_index]
-            if not isinstance(arg, ProxyValue):
-                return super().call_operator(op, args, kwargs, meta)
-
-            if not arg.is_tensor():
+            if not isinstance(arg, ProxyValue) or not arg.is_tensor():
                 return super().call_operator(op, args, kwargs, meta)
 
             if not isinstance(arg.node.target, EdgeOpOverload):
@@ -1696,7 +1631,7 @@ def call_operator(self, op, args, kwargs, meta):
         # Determine if the op is avg_pool1d or avg_pool2d
         avg_pool1d: bool = op == exir_ops.edge.aten.avg_pool1d.default
         # Get the input tensor
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
 
         # Replace avg_pool2d with custom avg_pool2d, and if the input tensor is
         # quantized, pass its zero_point tensor as arg to the custom avg_pool2d.
@@ -1709,7 +1644,7 @@ def call_operator(self, op, args, kwargs, meta):
         ceil_mode = args[4] if len(args) >= 5 else False
         count_include_pad = args[5] if len(args) >= 6 else True
         divisor_override = args[6] if len(args) >= 7 else None
-        zero_point = torch.tensor(0, dtype=torch.int32)
+        zero_point = args[7] if len(args) >= 8 else None
 
         # If the op is avg_pool1d, then we need to reshape the 3d input to a 4d
         # tensor.
@@ -2062,7 +1997,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Get the second tensor
-        Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg
+        Y_tensor = Y_arg.to_tensor()
         # Concretize the bias
         zero_bias = super().call_operator(
             exir_ops.edge.aten.full.default,
@@ -2071,19 +2006,14 @@ def call_operator(self, op, args, kwargs, meta):
             meta,
         )
 
-        # If the arg was a ProxyValue, insert a transpose node. Otherwise we
-        # can simply transpose the tensor inplace.
-        if isinstance(Y_arg, ProxyValue):
-            transpose_args = (Y_arg, -1, -2)
-            transpose_node = super().call_operator(
-                exir_ops.edge.aten.transpose_copy.int,
-                transpose_args,
-                {},
-                meta,
-            )
-            Y_arg_t = transpose_node
-        else:
-            Y_arg_t = Y_tensor.transpose(-1, -2)
+        # Y_arg is always a ProxyValue, so we insert a transpose node
+        transpose_args = (Y_arg, -1, -2)
+        Y_arg_t = super().call_operator(
+            exir_ops.edge.aten.transpose_copy.int,
+            transpose_args,
+            {},
+            meta,
+        )
 
         # Construct the new args, and return the transposed matmult op
         new_args = (
@@ -2178,7 +2108,7 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         # Get the input tensor
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         # Permute NCHW to NHWC for computation
         in_tensor_permuted = in_tensor.permute(0, 2, 3, 1)
         in_tensor_shape = in_tensor_permuted.shape
@@ -2226,6 +2156,52 @@ def call_operator(self, op, args, kwargs, meta):
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding(ExportPass):
+    """
+    Replace torch.ops.quantized_decomposed.embedding_byte.dtype with
+    torch.ops.cadence.quantized_embedding_byte
+    """
+
+    def call_operator(
+        self,
+        op: torch._ops.OpOverload,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        # Check if the op is the quantized_decomposed.embedding_byte.dtype
+        if (
+            op == exir_ops.edge.quantized_decomposed.embedding_byte.default
+            or op == exir_ops.edge.quantized_decomposed.embedding_byte.dtype
+        ):
+            # Replace with cadence.quantized_embedding_byte
+            if len(args) < 6:
+                raise AssertionError(
+                    f"Expected 6 arguments for embedding_byte, got {len(args)}"
+                )
+            embedding = args[0]
+            scales = args[1]
+            weight_zero_points = args[2]
+            indices = args[5]
+            if op == exir_ops.edge.quantized_decomposed.embedding_byte.dtype:
+                dtype = kwargs.get("dtype", None)
+                if dtype is not None and dtype != torch.float32:
+                    raise AssertionError(
+                        f"Unsupported output dtype for embedding_byte: {dtype}"
+                    )
+
+            new_args = (embedding, scales, weight_zero_points, indices, False)
+            new_kwargs = {}
+            return super().call_operator(
+                exir_ops.edge.cadence.quantized_embedding_byte.default,
+                new_args,
+                new_kwargs,
+                meta,
+            )
+        return super().call_operator(op, args, kwargs, meta)
+
+
 class CommonReplacePasses:
     passes = [
         ReplaceSqueezeAndUnsqueezeWithViewPass,
@@ -2238,6 +2214,7 @@ class CommonReplacePasses:
         ReplacePT2QuantWithCadenceQuantPass,
         ReplacePT2DequantWithCadenceDequantPass,
         ReplacePowWithMulPass,
+        ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding,
     ]
 
 
diff --git a/backends/cadence/aot/simplify_ops.py b/backends/cadence/aot/simplify_ops.py
index bf836f09044..92c14cb0f5d 100644
--- a/backends/cadence/aot/simplify_ops.py
+++ b/backends/cadence/aot/simplify_ops.py
@@ -19,7 +19,7 @@
 from executorch.backends.cadence.aot.utils import rebind
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
-from executorch.exir.pass_base import ExportPass, ProxyValue
+from executorch.exir.pass_base import ExportPass
 
 
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
@@ -75,7 +75,7 @@ def call_operator(self, op, args, kwargs, meta):
         slice_scatter = op == exir_ops.edge.aten.slice_scatter.default
         # Parse the arguments
         # Extract the tensor to be sliced, and the slicing dimension
-        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        in_tensor = args[0].to_tensor()
         dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0
         # Make dim non-negative
         dim = dim if dim >= 0 else dim + in_tensor.dim()
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
index 32e9b43e68e..259752f3893 100644
--- a/backends/cadence/aot/tests/test_ref_implementations.py
+++ b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -172,7 +172,7 @@ def test_quantized_add(
                     torch.tensor(
                         [1073741824], dtype=torch.int32
                     ),  # out_multiplier (0.5 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     0,  # out_zero_point
                     torch.tensor([[0]], dtype=dtype),  # expected_output
                     per_tensor,
@@ -197,7 +197,7 @@ def test_quantized_add(
                     torch.tensor(
                         [1073741824], dtype=torch.int32
                     ),  # out_multiplier (0.5 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     0,  # out_zero_point
                     torch.tensor([[-2, -8]], dtype=dtype),  # expected_output
                     per_tensor,
@@ -220,7 +220,7 @@ def test_quantized_add(
                     torch.tensor(
                         [1073741824], dtype=torch.int32
                     ),  # out_multiplier (0.5 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     0,  # out_zero_point
                     torch.tensor([[0, 0]], dtype=dtype),  # expected_output
                     per_tensor,
@@ -244,7 +244,7 @@ def test_quantized_add(
                     torch.tensor(
                         [1073741824], dtype=torch.int32
                     ),  # out_multiplier (0.5 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     0,  # out_zero_point
                     torch.tensor(
                         [[[0, -2, -4], [-2, -7, -12]]], dtype=dtype
@@ -270,7 +270,7 @@ def test_quantized_add(
                     torch.tensor(
                         [268435456], dtype=torch.int32
                     ),  # out_multiplier (1.0 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     1,  # out_zero_point
                     torch.tensor([[1, 1]], dtype=dtype),  # expected_output
                     per_tensor,
@@ -295,7 +295,7 @@ def test_quantized_add(
                     torch.tensor(
                         [268435456], dtype=torch.int32
                     ),  # out_multiplier (1.0 * 2^31)
-                    torch.tensor([0], dtype=torch.int64),  # out_shift
+                    torch.tensor([0], dtype=torch.int32),  # out_shift
                     1,  # out_zero_point
                     torch.tensor([[1, 1]], dtype=dtype),  # expected_output
                     False,
@@ -317,7 +317,7 @@ def test_quantized_add(
                         [268435456], dtype=torch.int32
                     ),  # out_multiplier (0.125 * 2^31)
                     torch.tensor(
-                        [1], dtype=torch.int64
+                        [1], dtype=torch.int32
                     ),  # out_shift (shift=1, doubles the scale)
                     1,  # out_zero_point
                     torch.tensor([[1, 2]], dtype=dtype),  # expected_output
@@ -339,7 +339,7 @@ def test_quantized_add(
                         [268435456], dtype=torch.int32
                     ),  # out_multiplier (0.125 * 2^31)
                     torch.tensor(
-                        [1], dtype=torch.int64
+                        [1], dtype=torch.int32
                     ),  # out_shift (shift=1, doubles the scale)
                     1,  # out_zero_point
                     torch.tensor([[1, 2]], dtype=dtype),  # expected_output
@@ -350,6 +350,29 @@ def test_quantized_add(
                 for (matmul, transposed_matmul) in ((True, False), (True, True))
                 for (per_tensor, dtype) in ((True, torch.int8),)
             ],
+            *[
+                (
+                    torch.Size([2, 1, 2]),  # src_shape: 1 sample, 2 input features
+                    torch.Size(
+                        [2, 2, 2]
+                    ),  # weight_shape: 2 output features, 2 input features
+                    2,  # in_zero_point
+                    torch.tensor([1, 1], dtype=dtype),  # weight_zero_point
+                    torch.tensor(
+                        [268435456], dtype=torch.int32
+                    ),  # out_multiplier (0.125 * 2^31)
+                    torch.tensor(
+                        [1], dtype=torch.int32
+                    ),  # out_shift (shift=1, doubles the scale)
+                    1,  # out_zero_point
+                    torch.tensor([[[1, 2]], [[0, -1]]], dtype=dtype),  # expected_output
+                    per_tensor,
+                    matmul,
+                    transposed_matmul,
+                )
+                for (matmul, transposed_matmul) in ((True, False), (True, True))
+                for (per_tensor, dtype) in ((True, torch.int8),)
+            ],
         ]
     )
     def test_quantized_linear(
@@ -380,7 +403,7 @@ def test_quantized_linear(
             .to(expected_output.dtype)
         )
         if matmul and not transposed_matmul:
-            weight = weight.T
+            weight = weight.transpose(-1, -2)
 
         if per_tensor:
             weight_zero_point = weight_zero_point[0]
@@ -1080,61 +1103,39 @@ def test_quantized_conv_per_tensor(
                 )
                 for dtype in [torch.uint8]
             ],
-            # Test case 4: Non-per-tensor
-            *[
-                (
-                    "non_per_tensor",
-                    torch.tensor([-1, -2, -3, 1, 2, 3], dtype=dtype),  # input
-                    torch.tensor([0, 0, 0, 1, 1, 1]),  # X_zero_point
-                    5,  # out_zero_point
-                    torch.tensor([1073741824]),  # out_multiplier (0.5 * 2^31)
-                    torch.tensor([1]),  # out_shift (multiply by 2^1 = 2)
-                    dtype,  # dtype
-                    torch.tensor([5, 5, 5, 5, 4, 3], dtype=dtype),
-                )
-                for dtype in [torch.int8]
-            ],
         ]
     )
     def test_quantized_relu(
         self,
         name: str,
         X: torch.Tensor,
-        X_zero_point: torch.Tensor | int,
+        X_zero_point: int,
         out_zero_point: int,
-        out_multiplier: torch.Tensor | int,
-        out_shift: torch.Tensor | int,
+        out_multiplier: int,
+        out_shift: int,
         dtype: torch.dtype,
         expected_output: torch.Tensor,
     ) -> None:
 
-        if isinstance(X_zero_point, int):
-            assert isinstance(out_multiplier, int)
-            assert isinstance(out_shift, int)
-
-            match dtype:
-                case torch.int8:
-                    quantized_relu = (
-                        torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor
-                    )
-                case torch.uint8:
-                    quantized_relu = (
-                        torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor
-                    )
-                case _:
-                    quantized_relu = torch.ops.cadence.quantized_relu_per_tensor
+        match dtype:
+            case torch.int8:
+                quantized_relu = (
+                    torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor
+                )
+            case torch.uint8:
+                quantized_relu = (
+                    torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor
+                )
+            case _:
+                quantized_relu = torch.ops.cadence.quantized_relu_per_tensor
 
-            output = quantized_relu(
-                X,
-                X_zero_point,
-                out_zero_point,
-                out_multiplier,
-                out_shift,
-            )
-        else:
-            output = torch.ops.cadence.quantized_relu(
-                X, X_zero_point, out_zero_point, out_multiplier, out_shift
-            )
+        output = quantized_relu(
+            X,
+            X_zero_point,
+            out_zero_point,
+            out_multiplier,
+            out_shift,
+        )
 
         # Verify output properties
         self.assertEqual(output.dtype, dtype, f"Output dtype should be {dtype}")
@@ -1534,6 +1535,143 @@ def test_convolution(
             f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
         )
 
+    @expand(
+        [
+            # Basic 2D transposed convolution with stride=1 (current test case - corrected name)
+            (
+                "basic_2d_stride1",
+                torch.tensor(
+                    [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+                ),  # input: 1x1x2x2
+                torch.tensor(
+                    [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32
+                ),  # weight: 1x1x2x2
+                torch.tensor([0.0], dtype=torch.float32),  # bias
+                (1, 1),  # stride
+                (0, 0),  # padding
+                (1, 1),  # dilation
+                1,  # groups
+                (0, 0),  # output_padding
+                False,  # channel_last
+                torch.tensor(
+                    [[[[1.0, 3.0, 2.0], [4.0, 10.0, 6.0], [3.0, 7.0, 4.0]]]],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 2D transposed convolution with channel_last=True (NHWC format)
+            (
+                "channel_last_nhwc",
+                torch.tensor(
+                    [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32
+                ),  # input: 1x2x2x1 (NHWC)
+                torch.tensor(
+                    [[[[1.0], [1.0]], [[1.0], [1.0]]]], dtype=torch.float32
+                ),  # weight: 1x2x2x1 (NHWC)
+                torch.tensor([0.0], dtype=torch.float32),  # bias
+                (1, 1),  # stride
+                (0, 0),  # padding
+                (1, 1),  # dilation
+                1,  # groups
+                (0, 0),  # output_padding
+                True,  # channel_last=True
+                torch.tensor(
+                    [
+                        [
+                            [[1.0], [3.0], [2.0]],
+                            [[4.0], [10.0], [6.0]],
+                            [[3.0], [7.0], [4.0]],
+                        ]
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 2D transposed convolution with non-zero bias
+            (
+                "with_bias",
+                torch.tensor(
+                    [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32
+                ),  # input: 1x1x2x2
+                torch.tensor(
+                    [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32
+                ),  # weight: 1x1x2x2
+                torch.tensor([5.0], dtype=torch.float32),  # bias=5.0
+                (1, 1),  # stride
+                (0, 0),  # padding
+                (1, 1),  # dilation
+                1,  # groups
+                (0, 0),  # output_padding
+                False,  # channel_last
+                torch.tensor(
+                    [[[[6.0, 7.0, 5.0], [8.0, 10.0, 7.0], [5.0, 8.0, 9.0]]]],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 1D transposed convolution (3D tensor, NLC format)
+            (
+                "conv1d_nlc",
+                torch.tensor(
+                    [[[1.0], [2.0], [3.0]]], dtype=torch.float32
+                ),  # input: 1x3x1 (NLC)
+                torch.tensor(
+                    [[[1.0], [0.5]]], dtype=torch.float32
+                ),  # weight: 1x2x1 (NLC)
+                torch.tensor([0.0], dtype=torch.float32),  # bias
+                (2, 0),  # stride
+                (0, 0),  # padding
+                (1, 1),  # dilation
+                1,  # groups
+                (0, 0),  # output_padding
+                True,  # channel_last=True
+                torch.tensor(
+                    [[[1.0], [0.5], [2.0], [1.0], [3.0], [1.5]]], dtype=torch.float32
+                ),
+            ),
+        ]
+    )
+    def test_transposed_convolution(
+        self,
+        name: str,
+        input_tensor: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: tuple[int, int],
+        padding: tuple[int, int],
+        dilation: tuple[int, int],
+        groups: int,
+        output_padding: tuple[int, int],
+        channel_last: bool,
+        expected_output: torch.Tensor,
+    ) -> None:
+        output = torch.ops.cadence.transposed_convolution(
+            input_tensor,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            output_padding,
+            groups,
+            channel_last,
+        )
+
+        # Verify output properties
+        self.assertEqual(
+            output.dtype,
+            input_tensor.dtype,
+            f"Output dtype should match input dtype in {name}",
+        )
+        self.assertEqual(
+            output.shape,
+            expected_output.shape,
+            f"Output shape should match expected shape in {name}",
+        )
+
+        # Verify output matches expected values
+        self.assertTrue(
+            torch.equal(output, expected_output),
+            f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
+        )
+
     @expand(
         [
             # Basic non-quantized average pooling
@@ -1706,3 +1844,579 @@ def test_avg_pool2d(
                 torch.equal(output, expected_output),
                 f"Output values don't match expected in {name}. Got {output}, expected {expected_output}",
             )
+
+    @expand(
+        [
+            # Basic 2x2 kernel, stride 1, no padding, NCHW
+            (
+                "nchw_basic_2x2",
+                torch.tensor(
+                    [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+                ),  # (N=1, C=1, H=3, W=3)
+                (2, 2),  # kernel_size
+                (1, 1),  # dilation
+                (0, 0),  # padding
+                (1, 1),  # stride
+                None,  # in_zero_point
+                False,  # channel_last
+                False,
+                torch.tensor(
+                    [
+                        [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 2x2 kernel, stride 2, no padding, NCHW
+            (
+                "nchw_stride2",
+                torch.tensor(
+                    [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32
+                ),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (2, 2),
+                None,
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [[1, 2, 4, 5]],
+                    ],
+                    dtype=torch.float32,  # Only every other patch in each dim
+                ),
+            ),
+            # 2x2 kernel, stride 1, padding 1, NCHW
+            (
+                "nchw_padding1",
+                torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32),  # (1,1,2,2)
+                (2, 2),
+                (1, 1),
+                (1, 1),
+                (1, 1),
+                None,
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [0, 0, 0, 1],
+                            [0, 0, 1, 2],
+                            [0, 0, 2, 0],
+                            [0, 1, 0, 3],
+                            [1, 2, 3, 4],
+                            [2, 0, 4, 0],
+                            [0, 3, 0, 0],
+                            [3, 4, 0, 0],
+                            [4, 0, 0, 0],
+                        ],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 2x2 kernel, stride 1, no padding, NHWC
+            (
+                "nhwc_basic_2x2",
+                torch.tensor(
+                    [[[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]]],
+                    dtype=torch.float32,
+                ),  # (N=1, H=3, W=3, C=1)
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                None,
+                True,
+                False,
+                torch.tensor(
+                    [
+                        [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            # 2x2 kernel, stride 1, no padding, NCHW, in_zero_point=1
+            (
+                "nchw_in_zero_point_no_padding",
+                torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                torch.tensor(1, dtype=torch.int32),
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [[2, 3, 5, 6], [3, 4, 6, 7], [5, 6, 8, 9], [6, 7, 9, 10]],
+                    ],
+                    dtype=torch.int8,
+                ),
+            ),
+            (
+                "nchw_in_zero_point_with_padding=1_and_stride=2",
+                torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8),
+                (2, 2),
+                (1, 1),
+                (1, 1),
+                (2, 2),
+                torch.tensor(-1, dtype=torch.int32),
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [-1, -1, -1, 2],
+                            [-1, -1, 3, 4],
+                            [-1, 5, -1, 8],
+                            [6, 7, 9, 10],
+                        ],
+                    ],
+                    dtype=torch.int8,
+                ),
+            ),
+            # 2x2 kernel, stride 1, no padding, NHWC, in_zero_point=2
+            (
+                "nhwc_in_zero_point",
+                torch.tensor(
+                    [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+                    dtype=torch.int8,
+                ),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                torch.tensor(2, dtype=torch.int32),
+                True,
+                False,
+                torch.tensor(
+                    [
+                        [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+                    ],
+                    dtype=torch.int8,
+                ),
+            ),
+            # Multi-channel input, 2x2 kernel, stride 1, no padding, NCHW
+            (
+                "nchw_multi_channel",
+                torch.tensor(
+                    [
+                        [
+                            [[1, 2, 3], [4, 5, 6], [7, 8, 9]],  # channel 0
+                            [[10, 11, 12], [13, 14, 15], [16, 17, 18]],  # channel 1
+                        ]
+                    ],
+                    dtype=torch.float32,
+                ),  # (1,2,3,3)
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                None,
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [1, 2, 4, 5, 10, 11, 13, 14],
+                            [2, 3, 5, 6, 11, 12, 14, 15],
+                            [4, 5, 7, 8, 13, 14, 16, 17],
+                            [5, 6, 8, 9, 14, 15, 17, 18],
+                        ],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            # Multi-channel input and multi-channel zero-point
+            (
+                "nchw_multi_channel_and_zero_point_no_padding",
+                torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+                (1, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                torch.tensor([-1, -2], dtype=torch.int32),
+                False,
+                False,
+                torch.tensor([[[1, 2], [2, 3]], [[4, 5], [5, 6]]], dtype=torch.int32),
+            ),
+            (
+                "nchw_multi_channel_and_zero_point_with_padding=1_and_stride=(2, 1)",
+                torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32),
+                (1, 2),
+                (1, 1),
+                (2, 1),
+                (2, 2),
+                torch.tensor([-1, -2], dtype=torch.int32),
+                False,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [-1, -1],
+                            [-1, -1],
+                            [-1, 1],
+                            [2, 3],
+                            [-1, -1],
+                            [-1, -1],
+                        ],
+                        [
+                            [-2, -2],
+                            [-2, -2],
+                            [-2, 4],
+                            [5, 6],
+                            [-2, -2],
+                            [-2, -2],
+                        ],
+                    ],
+                    dtype=torch.int32,
+                ),
+            ),
+            (
+                "per_tensor",
+                torch.tensor(
+                    [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]],
+                    dtype=torch.int8,
+                ),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                2,
+                True,
+                True,
+                torch.tensor(
+                    [
+                        [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]],
+                    ],
+                    dtype=torch.int8,
+                ),
+            ),
+        ]
+    )
+    def test_im2row(
+        self,
+        name: str,
+        input_tensor: torch.Tensor,
+        kernel_size: tuple[int, int],
+        dilation: tuple[int, int],
+        padding: tuple[int, int],
+        stride: tuple[int, int],
+        in_zero_point: torch.Tensor | None,
+        channel_last: bool,
+        per_tensor: bool,
+        expected_output: torch.Tensor,
+    ) -> None:
+        if per_tensor:
+            output = torch.ops.cadence.im2row.per_tensor(
+                input_tensor,
+                kernel_size,
+                dilation,
+                padding,
+                stride,
+                in_zero_point,
+                channel_last,
+            )
+        else:
+            output = torch.ops.cadence.im2row(
+                input_tensor,
+                kernel_size,
+                dilation,
+                padding,
+                stride,
+                in_zero_point,
+                channel_last,
+            )
+        self.assertEqual(
+            output.shape,
+            expected_output.shape,
+            f"im2row output shape mismatch in {name}",
+        )
+        self.assertTrue(
+            torch.equal(output, expected_output),
+            f"im2row output mismatch in {name}: got {output}, expected {expected_output}",
+        )
+
+    @expand(
+        [
+            (
+                "basic_2x2",
+                torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                (0, 0),
+                None,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [1, 0, 0, 0],
+                            [1, 2, 0, 0],
+                            [0, 2, 0, 0],
+                            [1, 0, 3, 0],
+                            [1, 2, 3, 4],
+                            [0, 2, 0, 4],
+                            [0, 0, 3, 0],
+                            [0, 0, 3, 4],
+                            [0, 0, 0, 4],
+                        ]
+                    ],
+                    dtype=torch.int32,
+                ),
+            ),
+            (
+                "basic_2x2_with_zero_point",
+                torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+                (2, 2),
+                (1, 1),
+                (0, 0),
+                (1, 1),
+                (0, 0),
+                torch.tensor(100, dtype=torch.int32),
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [1, 100, 100, 100],
+                            [1, 2, 100, 100],
+                            [100, 2, 100, 100],
+                            [1, 100, 3, 100],
+                            [1, 2, 3, 4],
+                            [100, 2, 100, 4],
+                            [100, 100, 3, 100],
+                            [100, 100, 3, 4],
+                            [100, 100, 100, 4],
+                        ]
+                    ],
+                    dtype=torch.int32,
+                ),
+            ),
+            (
+                "basic_2x2_with_stride_2",
+                torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32),
+                (2, 2),  # kernel size
+                (1, 1),  # dilation
+                (0, 0),  # padding
+                (2, 2),  # stride
+                (0, 0),  # output padding
+                None,
+                False,
+                torch.tensor(
+                    [
+                        [
+                            [1, 0, 0, 0],
+                            [1, 0, 0, 0],
+                            [0, 2, 0, 0],
+                            [0, 2, 0, 0],
+                            [1, 0, 0, 0],
+                            [1, 0, 0, 0],
+                            [0, 2, 0, 0],
+                            [0, 2, 0, 0],
+                            [0, 0, 3, 0],
+                            [0, 0, 3, 0],
+                            [0, 0, 0, 4],
+                            [0, 0, 0, 4],
+                            [0, 0, 3, 0],
+                            [0, 0, 3, 0],
+                            [0, 0, 0, 4],
+                            [0, 0, 0, 4],
+                        ]
+                    ],
+                    dtype=torch.int32,
+                ),
+            ),
+            (
+                "batch2_with_batch2_zero_point",
+                torch.tensor(
+                    [
+                        [[[1, 2], [3, 4]]],
+                        [[[5, 6], [7, 8]]],
+                    ],
+                    dtype=torch.int32,
+                ),  # input: (2,1,2,2)
+                (2, 2),  # kernel_size
+                (1, 1),  # dilation
+                (0, 0),  # padding
+                (1, 1),  # stride
+                (0, 0),  # output_padding
+                torch.tensor([100, 200], dtype=torch.int32),  # in_zero_point per batch
+                False,  # channel_last
+                torch.tensor(
+                    [
+                        [
+                            [1, 100, 100, 100],
+                            [1, 2, 100, 100],
+                            [100, 2, 100, 100],
+                            [1, 100, 3, 100],
+                            [1, 2, 3, 4],
+                            [100, 2, 100, 4],
+                            [100, 100, 3, 100],
+                            [100, 100, 3, 4],
+                            [100, 100, 100, 4],
+                        ],
+                        [
+                            [5, 200, 200, 200],
+                            [5, 6, 200, 200],
+                            [200, 6, 200, 200],
+                            [5, 200, 7, 200],
+                            [5, 6, 7, 8],
+                            [200, 6, 200, 8],
+                            [200, 200, 7, 200],
+                            [200, 200, 7, 8],
+                            [200, 200, 200, 8],
+                        ],
+                    ],
+                    dtype=torch.int32,
+                ),
+            ),
+        ]
+    )
+    def test_transposed_im2row(
+        self,
+        name: str,
+        input_tensor: torch.Tensor,
+        kernel_size: tuple[int, int],
+        dilation: tuple[int, int],
+        padding: tuple[int, int],
+        stride: tuple[int, int],
+        output_padding: tuple[int, int],
+        in_zero_point: torch.Tensor | int | None,
+        channel_last: bool,
+        expected_output: torch.Tensor,
+    ) -> None:
+        output = torch.ops.cadence.transposed_im2row(
+            input_tensor,
+            kernel_size,
+            dilation,
+            padding,
+            stride,
+            output_padding,
+            in_zero_point,
+            channel_last,
+        )
+
+        self.assertEqual(
+            output.shape,
+            expected_output.shape,
+            f"transposed_im2row output shape mismatch in {name}: got {output.shape}, expected {expected_output.shape}",
+        )
+        self.assertTrue(
+            torch.equal(output, expected_output),
+            f"transposed_im2row output mismatch in {name}: got {output}, expected {expected_output}",
+        )
+
+    @expand(
+        [
+            (
+                "1_group",
+                torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8),
+                torch.tensor([1, 1, 1], dtype=torch.float32),
+                torch.tensor([0, 0, 0], dtype=torch.int8),
+                torch.tensor([0, 2, 1], dtype=torch.int64),
+                torch.tensor(
+                    [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]],
+                    dtype=torch.float32,
+                ),
+            ),
+            (
+                "2_groups",
+                torch.tensor(
+                    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8
+                ),
+                torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32),
+                torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8),
+                torch.tensor([0, 2, 1], dtype=torch.int64),
+                torch.tensor(
+                    [
+                        [0.0, 0.5, 1.0, 2.0],
+                        [10.0, 12.5, 15.0, 18.0],
+                        [3.0, 4.5, 6.0, 8.0],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            (
+                "1_group_none_zero_point",
+                torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8),
+                torch.tensor([1, 1, 1], dtype=torch.float32),
+                None,
+                torch.tensor([0, 2, 1], dtype=torch.int64),
+                torch.tensor(
+                    [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]],
+                    dtype=torch.float32,
+                ),
+            ),
+            (
+                "1_group_batch2",
+                torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8),
+                torch.tensor([1, 1, 1], dtype=torch.float32),
+                torch.tensor([0, 0, 0], dtype=torch.int8),
+                torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64),
+                torch.tensor(
+                    [
+                        [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]],
+                        [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            (
+                "2_groups_batch2",
+                torch.tensor(
+                    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8
+                ),
+                torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32),
+                torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8),
+                torch.tensor([[0, 2, 1], [2, 1, 0]], dtype=torch.int64),
+                torch.tensor(
+                    [
+                        [
+                            [0.0, 0.5, 1.0, 2.0],
+                            [10.0, 12.5, 15.0, 18.0],
+                            [3.0, 4.5, 6.0, 8.0],
+                        ],
+                        [
+                            [10.0, 12.5, 15.0, 18.0],
+                            [3.0, 4.5, 6.0, 8.0],
+                            [0.0, 0.5, 1.0, 2.0],
+                        ],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+            (
+                "1_group_none_zero_point_batch2",
+                torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8),
+                torch.tensor([1, 1, 1], dtype=torch.float32),
+                None,
+                torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64),
+                torch.tensor(
+                    [
+                        [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]],
+                        [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]],
+                    ],
+                    dtype=torch.float32,
+                ),
+            ),
+        ]
+    )
+    def test_quantized_embedding_byte(
+        self,
+        name: str,
+        weight: torch.Tensor,
+        weight_scales: torch.Tensor,
+        weight_zero_points: torch.Tensor | None,
+        indices: torch.Tensor,
+        expected_out: torch.Tensor,
+    ) -> None:
+        self.assertTrue(
+            torch.equal(
+                torch.ops.cadence.quantized_embedding_byte(
+                    weight, weight_scales, weight_zero_points, indices
+                ),
+                expected_out,
+            )
+        )
diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py
index c15755f58c5..e2fbd516757 100644
--- a/backends/cadence/aot/tests/test_replace_ops_passes.py
+++ b/backends/cadence/aot/tests/test_replace_ops_passes.py
@@ -45,6 +45,7 @@
     ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
     ReplaceSplitWithSlicePass,
     ReplaceSqueezeAndUnsqueezeWithViewPass,
+    ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding,
     ReplaceTransposedConvWithLinearPass,
     ReplaceTrivialConvWithLinear,
     ReplaceWhereWithFullArgsWithWhereScalar,
@@ -2269,3 +2270,48 @@ def test_replace_aten_linalg_svd_with_cadence_linalg_svd(
             count_node(graph_after_passes, exir_ops.edge.cadence.linalg_svd.default),
             1,
         )
+
+    @expand([("dtype",), ("default",)])
+    @torch.no_grad()
+    def test_replace_quantized_embedding(
+        self,
+        name: str,
+    ) -> None:
+        embedding = torch.ones(5, 6, dtype=torch.int8)
+        indices = torch.tensor([0, 2], dtype=torch.int32)
+        scales = torch.ones(5, 2, dtype=torch.float32)
+        zero_points = None
+
+        original_gm = single_op_builder(
+            placeholders=(embedding, scales, indices),
+            op=(
+                exir_ops.edge.quantized_decomposed.embedding_byte.dtype
+                if name == "dtype"
+                else exir_ops.edge.quantized_decomposed.embedding_byte.default
+            ),
+            args=(embedding, scales, zero_points, -128, 127, indices),
+            kwargs={"dtype": torch.float32} if name == "dtype" else {},
+        )
+
+        p = ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding()
+        graph_after_passes = cast(PassResult, p(original_gm)).graph_module
+
+        self.assertEqual(
+            count_node(
+                graph_after_passes,
+                (
+                    exir_ops.edge.quantized_decomposed.embedding_byte.dtype
+                    if name == "dtype"
+                    else exir_ops.edge.quantized_decomposed.embedding_byte.default
+                ),
+            ),
+            0,
+        )
+
+        self.assertEqual(
+            count_node(
+                graph_after_passes,
+                exir_ops.edge.cadence.quantized_embedding_byte.default,
+            ),
+            1,
+        )
diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py
index a09f3578391..e49cf412c19 100644
--- a/backends/cadence/utils/facto_util.py
+++ b/backends/cadence/utils/facto_util.py
@@ -222,6 +222,34 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
                     cp.Value.Le(lambda deps, dtype, struct: 2),
                 ]
             )
+        case "transpose_copy.int":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
+                ]
+            )
+        case "permute_copy.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int8, torch.uint8]),
+                    cp.Rank.Le(
+                        lambda deps: 5
+                    ),  # xa_nn_transpose only supports up to 5D
+                    cp.Rank.Ge(lambda deps: 1),  # Must have at least 1 dimension
+                ]
+            )
+        case "sqrt.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
+                ]
+            )
+        case "clamp.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
+                ]
+            )
         case "rsqrt.default":
             tensor_constraints.extend(
                 [
@@ -232,6 +260,12 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
                     cp.Value.Le(lambda deps, dtype, struct: 2**2),
                 ]
             )
+        case "relu.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32]),
+                ]
+            )
         case "mean.dim":
             tensor_constraints.extend(
                 [
@@ -241,10 +275,17 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
         case "exp.default":
             tensor_constraints.extend(
                 [
+                    cp.Dtype.In(lambda deps: [torch.float32]),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**2)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**2),
                 ]
             )
+        case "tanh.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32]),
+                ]
+            )
         case "slice_copy.Tensor":
             tensor_constraints.extend(
                 [
@@ -253,6 +294,34 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
                     cp.Value.Le(lambda deps, dtype, struct: 2),
                 ]
             )
+        case "div.Scalar" | "add.Tensor" | "mul.Tensor" | "sub.Tensor":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(
+                        lambda deps: [
+                            torch.int32,
+                            torch.int64,
+                            torch.float32,
+                        ]
+                    ),
+                ]
+            )
+        case "split_copy.Tensor":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(
+                        lambda deps: [
+                            torch.int32,
+                            torch.int64,
+                            torch.float32,
+                        ]
+                    ),
+                    cp.Value.Ge(lambda deps, dtype, struct: 1),
+                    cp.Value.Le(lambda deps, dtype, struct: 2**3),
+                    cp.Rank.Le(lambda deps: 3),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                ]
+            )
         case "constant_pad_nd.default":
             tensor_constraints.extend(
                 [
@@ -283,6 +352,12 @@ def random_size_constraint(deps: object, r: int, d: int) -> int:
                     cp.Rank.Le(lambda deps: 2**2),
                 ]
             )
+        case "pow.Tensor_Scalar":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.In(lambda deps: [torch.float32, torch.int32]),
+                ]
+            )
         case "div.Tensor_mode" | "minimum.default":
             if index == 0:
                 tensor_constraints = [
diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
index 833606fb651..daffecda1bf 100644
--- a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
@@ -31,25 +31,24 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::vision::native::kernels::dequantize<uint8_t>(
+    kernels::dequantize<uint8_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::vision::native::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    impl::vision::native::kernels::dequantize<uint16_t>(
+    kernels::dequantize<uint16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    impl::vision::native::kernels::dequantize<int16_t>(
+    kernels::dequantize<int16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Int) {
     const int32_t* input_data = input.const_data_ptr<int32_t>();
-    impl::vision::native::kernels::dequantize<int32_t>(
+    kernels::dequantize<int32_t>(
         out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
index 8d209af24b1..cd72d2de2b5 100644
--- a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
@@ -33,25 +33,25 @@ void quantize_per_tensor_out(
 
   if (out.scalar_type() == ScalarType::Byte) {
     uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-    impl::vision::native::kernels::quantize<uint8_t>(
+    kernels::quantize<uint8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Char) {
     int8_t* out_data = out.mutable_data_ptr<int8_t>();
-    impl::vision::native::kernels::quantize<int8_t>(
+    kernels::quantize<int8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (
       out.scalar_type() == ScalarType::Bits16 ||
       out.scalar_type() == ScalarType::UInt16) {
     uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
-    impl::vision::native::kernels::quantize<uint16_t>(
+    kernels::quantize<uint16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Short) {
     int16_t* out_data = out.mutable_data_ptr<int16_t>();
-    impl::vision::native::kernels::quantize<int16_t>(
+    kernels::quantize<int16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Int) {
     int32_t* out_data = out.mutable_data_ptr<int32_t>();
-    impl::vision::native::kernels::quantize<int32_t>(
+    kernels::quantize<int32_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
index 6ffb36aa836..1e1e6c8cdc7 100644
--- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -141,8 +141,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_plane[_oh * ow + _ow] =
-                  ::impl::vision::native::kernels::quantize<OT>(
-                      val, inv_out_scale, out_zero_point);
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_plane[_oh * ow + _ow] = acc;
             }
@@ -267,8 +266,8 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic(
             }
             if (quantized) {
               float val = bias_scale * acc;
-              out_line[_oc] = ::impl::vision::native::kernels::quantize<OT>(
-                  val, inv_out_scale, out_zero_point);
+              out_line[_oc] =
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_line[_oc] = acc;
             }
@@ -530,6 +529,80 @@ void quantized_conv_per_tensor_out(
   }
 }
 
+void quantized_conv2d_nchw_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out(
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      false, // channel_last = false for NCHW
+      out);
+}
+
+void quantized_conv2d_nhwc_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out(
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      true, // channel_last = true for NHWC
+      out);
+}
+
 } // namespace native
 } // namespace vision
 } // namespace impl
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
index e2963bdcffe..58ca33c6a0b 100644
--- a/backends/cadence/vision/operators/op_softmax.cpp
+++ b/backends/cadence/vision/operators/op_softmax.cpp
@@ -6,13 +6,13 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <api.h>
 #include <executorch/backends/cadence/vision/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <idma_init.h>
+#include <include/api.h>
+#include <include_private/idma_init.h>
 #include <stdio.h>
 
 using executorch::aten::ScalarType;
diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h
index b42e45b0b3d..a7251724c53 100644
--- a/backends/cadence/vision/operators/quantized_ops.h
+++ b/backends/cadence/vision/operators/quantized_ops.h
@@ -49,7 +49,7 @@ inline __attribute__((always_inline)) void quantized_linear_per_tensor_(
             (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point;
         sum += x * w;
       }
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
+      out_data[i * out_dim + j] = impl::vision::kernels::quantize<T>(
           sum, requant_scale, out_zero_point);
     }
   }
@@ -121,8 +121,8 @@ inline __attribute__((always_inline)) void quantized_linear_per_channel_(
       // Compute the out_scale from out_multiplier and out_shift
       const float out_scale =
           -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]);
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
-          sum, out_scale, out_zero_point);
+      out_data[i * out_dim + j] =
+          impl::vision::kernels::quantize<T>(sum, out_scale, out_zero_point);
     }
   }
 }
diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl
index b12118a9c47..2dd47e12bd2 100644
--- a/backends/cadence/vision/operators/targets.bzl
+++ b/backends/cadence/vision/operators/targets.bzl
@@ -21,6 +21,25 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     if deps == None:
         deps = []
 
+    # Determine which headers to export based on operator name
+    exported_headers = ["operators.h"]
+    
+    # Add quantized_ops.h header for quantized operators
+    quantized_ops = [
+        "quantized_fully_connected_out",
+        "quantized_matmul_out", 
+        "quantized_layer_norm",
+        "quantized_relu_out",
+        "quantized_conv_out",
+        "quantized_linear_out",
+        "quantize_per_tensor",
+        "dequantize_per_tensor",
+        "requantize_out"
+    ]
+    
+    if name in quantized_ops:
+        exported_headers.append("quantized_ops.h")
+
     runtime.cxx_library(
         name = op_name,
         srcs = [op_name + ".cpp"],
@@ -31,7 +50,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [
diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h
index ee0666842fd..841a39cf891 100644
--- a/backends/cadence/vision/third-party/include_private/idma_init.h
+++ b/backends/cadence/vision/third-party/include_private/idma_init.h
@@ -1,31 +1,36 @@
 #ifndef __IDMA__INIT_H__
 #define __IDMA__INIT_H__
 
-#include "dtypes.h"
+#include "../include/dtypes.h"
 #include "common.h"
 
-#define IDMA_BUFF_SIZE 16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
+#define IDMA_BUFF_SIZE \
+  16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
 
 #ifndef PLACE_IN_DRAM0
-	#define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data")))
+#define PLACE_IN_DRAM0 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data")))
 #endif
 
 #ifndef PLACE_IN_DRAM1
-	#define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data")))
+#define PLACE_IN_DRAM1 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data")))
 #endif
 
 float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0;
 float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1;
 
-float32_t *inpData[2] = {&data_dram0[0], &data_dram1[0]};
-float32_t *outData[2] = {&data_dram0[IDMA_BUFF_SIZE / 4], &data_dram1[IDMA_BUFF_SIZE / 4]};
+float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]};
+float32_t* outData[2] = {
+    &data_dram0[IDMA_BUFF_SIZE / 4],
+    &data_dram1[IDMA_BUFF_SIZE / 4]};
 
 IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC);
 IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC);
 
-idma_buffer_t * descbuf[] = {
-  buffer_idma_ch0,
-  buffer_idma_ch1,
+idma_buffer_t* descbuf[] = {
+    buffer_idma_ch0,
+    buffer_idma_ch1,
 };
 
-#endif // __IDMA__INIT_H__
\ No newline at end of file
+#endif // __IDMA__INIT_H__
diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
index 413b6f10567..27487c75d6c 100644
--- a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
+++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
@@ -63,33 +63,33 @@ y[N]   result, Q7.8 or floating point
 x,y    Must not overlap
 -------------------------------------------------------------------------*/
 
-#define IVP_ADDSN_2X32(b_, c_)                                                 \
-  ({                                                                           \
-    xb_vecN_2x32v a_;                                                          \
-    xb_vecN_2x64w tmp_a_;                                                      \
-    tmp_a_ = IVP_MULN_2X32(b_, 1);                                             \
-    IVP_MULAN_2X32(tmp_a_, c_, 1);                                             \
-    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0);                                         \
-    a_;                                                                        \
+#define IVP_ADDSN_2X32(b_, c_)         \
+  ({                                   \
+    xb_vecN_2x32v a_;                  \
+    xb_vecN_2x64w tmp_a_;              \
+    tmp_a_ = IVP_MULN_2X32(b_, 1);     \
+    IVP_MULAN_2X32(tmp_a_, c_, 1);     \
+    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \
+    a_;                                \
   })
 
 #if !HAVE_VFPU
-DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t *x, int N))
+DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N))
 #else
-void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
+void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
 #if !defined(IVP_MULN_2X32)
 #else
-  const int *pTbl = (const int *)expftbl_Q30;
+  const int* pTbl = (const int*)expftbl_Q30;
 #endif
-  const xb_vecN_2xf32 *restrict pX;
-  xb_vecN_2xf32 *restrict pY;
+  const xb_vecN_2xf32* restrict pX;
+  xb_vecN_2xf32* restrict pY;
   xb_vecN_2xf32 norm, ysum, xmax;
   int n;
   valign al_X, al_R, al_Y;
   if (N < 0)
     return;
   xmax = minusInff.f;
-  pX = (const xb_vecN_2xf32 *)x;
+  pX = (const xb_vecN_2xf32*)x;
   al_X = IVP_LAN_2XF32_PP(pX);
   al_Y = IVP_ZALIGN();
   for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) {
@@ -99,17 +99,17 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(x, al_X, pX,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
-    IVP_MAXNUMN_2XF32T(xmax, xmax, x,
-                       IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+    IVP_LAVN_2XF32_XP(
+        x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_MAXNUMN_2XF32T(
+        xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
   }
 
   xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0);
   __Pragma("no_reorder");
   ysum = 0.f;
-  pX = (const xb_vecN_2xf32 *)x;
-  pY = (xb_vecN_2xf32 *)y;
+  pX = (const xb_vecN_2xf32*)x;
+  pY = (xb_vecN_2xf32*)y;
   al_X = IVP_LAN_2XF32_PP(pX);
   {
     vboolN_2 bnan;
@@ -163,8 +163,8 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
     }
     if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
       xb_vecN_2xf32 x;
-      IVP_LAVN_2XF32_XP(x, al_X, pX,
-                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_LAVN_2XF32_XP(
+          x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
       x = IVP_SUBN_2XF32(x, xmax);
       bnan |= IVP_UNN_2XF32(x, x);
       {
@@ -206,18 +206,18 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
         zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp));
         x = zout;
       }
-      IVP_ADDN_2XF32T(ysum, ysum, x,
-                      IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
-      IVP_SAVN_2XF32_XP(x, al_Y, pY,
-                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_ADDN_2XF32T(
+          ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+      IVP_SAVN_2XF32_XP(
+          x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     }
     IVP_SAPOSN_2XF32_FP(al_Y, pY);
     ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan);
   }
   norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum));
   __Pragma("no_reorder");
-  pX = (const xb_vecN_2xf32 *)y;
-  pY = (xb_vecN_2xf32 *)y;
+  pX = (const xb_vecN_2xf32*)y;
+  pY = (xb_vecN_2xf32*)y;
 
   al_R = IVP_LAN_2XF32_PP(pX);
 
@@ -229,11 +229,11 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(x, al_R, pX,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_LAVN_2XF32_XP(
+        x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     x = IVP_MULN_2XF32(x, norm);
-    IVP_SAVN_2XF32_XP(x, al_Y, pY,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_SAVN_2XF32_XP(
+        x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
   }
   IVP_SAPOSN_2XF32_FP(al_Y, pY);
 
diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
index 0ed5dd22257..f1c6f3d44ae 100644
--- a/backends/cadence/vision/third-party/library/tables/expf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
@@ -42,22 +42,28 @@
    p(order)=p(order)-(sum(p)-2);
 */
 const int32_t ALIGN_2SIMD expftbl_Q30[8] = {
-    234841,    1329551,   10400465,   59570027,
-    257946177, 744260763, 1073741824, 0 /* Padding to allow for vector loads */
+    234841,
+    1329551,
+    10400465,
+    59570027,
+    257946177,
+    744260763,
+    1073741824,
+    0 /* Padding to allow for vector loads */
 };
 
 const union ufloat32uint32 ALIGN_2SIMD
     expfminmax[2] = /* minimum and maximum arguments of expf() input */
     {
         {0xc2ce8ed0}, /*-1.0327893066e+002f */
-        {0x42b17218}  /* 8.8722839355e+001f */
+        {0x42b17218} /* 8.8722839355e+001f */
 };
 
 const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */
 
 const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = {
     {0x3fb8aa3b}, /* 1.4426950216      */
-    {0x32a57060}  /* 1.9259629891e-008 */
+    {0x32a57060} /* 1.9259629891e-008 */
 };
 
 /*
@@ -70,5 +76,10 @@ p(order)=p(order)-(sum(p)-2);
 num2hex(single(p));
 */
 const union ufloat32uint32 ALIGN_2SIMD expftblf[] = {
-    {0x39655635}, {0x3aa24c7a}, {0x3c1eb2d1}, {0x3d633ddb},
-    {0x3e75ff24}, {0x3f317212}, {0x3f800000}};
+    {0x39655635},
+    {0x3aa24c7a},
+    {0x3c1eb2d1},
+    {0x3d633ddb},
+    {0x3e75ff24},
+    {0x3f317212},
+    {0x3f800000}};
diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
index 9b2bf62e6bf..8464ee9f549 100644
--- a/backends/cadence/vision/third-party/library/tables/inff_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
@@ -31,7 +31,7 @@
 #include "dtypes.h"
 
 const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */
-const union ufloat32uint32 plusInff = {0x7f800000};  /* +Inf */
+const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */
 const union ufloat32uint32 realmaxf = {
     0x7f7fffff}; /* maximum floating point number */
 const union ufloat32uint32 realminf = {
diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
index 27c5f437b9a..f165234fce4 100644
--- a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
@@ -27,9 +27,9 @@
 */
 
 /* Portable data types. */
-#include "dtypes.h"
 /* NaN values for single precision routines. */
 #include "nanf_tbl.h"
+#include "dtypes.h"
 
 const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN          */
 const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN               */
diff --git a/backends/cadence/vision/third-party/targets.bzl b/backends/cadence/vision/third-party/targets.bzl
index 6bbb7da8d49..26a097010d5 100644
--- a/backends/cadence/vision/third-party/targets.bzl
+++ b/backends/cadence/vision/third-party/targets.bzl
@@ -16,7 +16,7 @@ def define_common_targets():
             "include/*.h", 
             "include_private/*.h"
         ]),
-        header_namespace = "backends/cadence/vision/third-party",
+        header_namespace = "",
         visibility = [
             "//executorch/backends/cadence/...",
             "@EXECUTORCH_CLIENTS",
@@ -28,7 +28,11 @@ def define_common_targets():
         }),
         compiler_flags = select({
             "DEFAULT": ["-UCOMPILER_XTENSA"],  # Ensure COMPILER_XTENSA is not defined for non-Xtensa builds
-            "ovr_config//cpu:xtensa": ["-DCOMPILER_XTENSA"],
+            "ovr_config//cpu:xtensa": [
+                "-DCOMPILER_XTENSA",
+                "-Ixplat/executorch/backends/cadence/vision/third-party/include",
+                "-Ixplat/executorch/backends/cadence/vision/third-party/include_private",
+            ],
         }),
         define_static_target = True,
     )
diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index 24a34546732..a728584e49c 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -90,6 +90,7 @@ gen_operators_lib(
 install(
   TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn
   EXPORT ExecuTorchTargets
-  DESTINATION lib
-  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/backends/cortex_m/ops/
 )
diff --git a/backends/cortex_m/passes/quantized_linear_fusion_pass.py b/backends/cortex_m/passes/quantized_linear_fusion_pass.py
index 8f8a90eec2f..11a49beb2f4 100644
--- a/backends/cortex_m/passes/quantized_linear_fusion_pass.py
+++ b/backends/cortex_m/passes/quantized_linear_fusion_pass.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -19,9 +20,10 @@
 )
 
 from executorch.backends.transforms.utils import create_mutable_buffer, get_param_tensor
+
+from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
 from torch.fx import Node
 from torch.fx.passes.infra.pass_manager import PassResult
 
@@ -29,7 +31,7 @@
 logger.setLevel(logging.INFO)
 
 
-class QuantizedLinearFusionPass(ExportPass):
+class QuantizedLinearFusionPass(XNNPACKPass):
     """
     Cortex-M backend pass that fuses quantized linear-like patterns.
     Fuses: dequantize -> [linear/addmm/fc_ops] -> quantize
@@ -44,8 +46,7 @@ class QuantizedLinearFusionPass(ExportPass):
     requires_exported_program = True
 
     def __init__(self, exported_program: ExportedProgram):
-        super().__init__()
-        self._exported_program = exported_program
+        super().__init__(exported_program)
         self.nodes_to_erase = []
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh
new file mode 100755
index 00000000000..cc28ac5484a
--- /dev/null
+++ b/backends/cortex_m/test/build_test_runner.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# TODO: More separation from the regular arm executor runner and testing.
+
+set -eu
+
+# Always rebuild executorch in case the cortex-m kernels has been updated.
+script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
+et_root_dir=$(realpath "${script_dir}/../../..")
+build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh"
+${build_executorch}
+
+# Build executor runner with all portable ops selected and semi hosting
+build_dir="${et_root_dir}/arm_test"
+build_executor_runner="${et_root_dir}/backends/arm/scripts/build_executor_runner.sh"
+build_root_test_dir="${et_root_dir}/arm_test/arm_semihosting_executor_runner_corstone-300"
+
+${build_executor_runner} --pte=semihosting --target=ethos-u55-128 --output="${build_root_test_dir}"
diff --git a/backends/cortex_m/test/ops/test_add.py b/backends/cortex_m/test/ops/test_add.py
index 10edacb5a11..b7b0ffcbfbc 100644
--- a/backends/cortex_m/test/ops/test_add.py
+++ b/backends/cortex_m/test/ops/test_add.py
@@ -6,7 +6,11 @@
 
 import torch
 from executorch.backends.arm.test.common import parametrize
-from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
 from executorch.backends.test.suite.operators.test_add import Model, ModelAlpha
 
 
@@ -80,19 +84,19 @@ class CortexMAlphaAdd(ModelAlpha):
     ),
     "self_rank_2_pos": McuTestCase(
         CortexMSelfAdd(),
-        (torch.linspace(0, 1000, 10).reshape((10, 1)),),
+        (ramp_tensor(0, 1000, (10, 1)),),
     ),
     "self_rank_3_neg": McuTestCase(
         CortexMSelfAdd(),
-        (torch.linspace(-100, 0, 8).reshape((2, 2, 2)),),
+        (ramp_tensor(-100, 0, (2, 2, 2)),),
     ),
     "self_rank_4_small": McuTestCase(
         CortexMSelfAdd(),
-        (torch.linspace(-0.1, 0.1, 16).reshape(2, 2, 2, 2),),
+        (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),),
     ),
     "self_rank_5": McuTestCase(
         CortexMSelfAdd(),
-        (torch.linspace(-5, 5, 32).reshape(2, 2, 2, 2, 2),),
+        (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),),
     ),
     "scalar_scalar": McuTestCase(
         CortexMScalarAdd(),
@@ -117,15 +121,15 @@ class CortexMAlphaAdd(ModelAlpha):
     "broadcast_3": McuTestCase(
         CortexMTensorAdd(),
         (
-            torch.linspace(-2, 2, 4).reshape(2, 1, 2, 1),
-            torch.linspace(-5, 5, 4).reshape(1, 2, 1, 2),
+            ramp_tensor(-2, 2, (2, 1, 2, 1)),
+            ramp_tensor(-5, 5, (1, 2, 1, 2)),
         ),
     ),
     "alpha": McuTestCase(
         CortexMAlphaAdd(0.5),
         (
-            torch.linspace(-10, 10, 20).reshape(4, 5),
-            torch.linspace(-20, 20, 20).reshape(4, 5),
+            ramp_tensor(-10, 10, (4, 5)),
+            ramp_tensor(-20, 20, (4, 5)),
         ),
     ),
 }
diff --git a/backends/cortex_m/test/ops/test_linear.py b/backends/cortex_m/test/ops/test_linear.py
new file mode 100644
index 00000000000..a1275352fcf
--- /dev/null
+++ b/backends/cortex_m/test/ops/test_linear.py
@@ -0,0 +1,211 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+
+
+class CortexMMm(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.mm(x, y)
+
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mm_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+
+class CortexMBmm(torch.nn.Module):
+    def forward(self, x, y):
+        return torch.bmm(x, y)
+
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_bmm_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+
+class CortexMAddmm(torch.nn.Module):
+    def forward(self, x, y, z, alpha=None, beta=None):
+        return torch.addmm(beta, x, alpha, y, z)
+
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_addmm_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+
+class CortexMAt(CortexMMm):
+    def forward(self, x, y):
+        return x @ y
+
+
+class CortexMMatmul(CortexMMm):
+    def forward(self, x, y):
+        return torch.matmul(x, y)
+
+
+class CortexMLinear(CortexMMatmul):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.linear = torch.nn.Linear(*args, bias=False)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
+class CortexMLinearBias(CortexMAddmm):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.linear = torch.nn.Linear(*args, bias=True)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
+test_cases = {
+    "mm": McuTestCase(
+        model=CortexMMm(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+        ),
+    ),
+    "bmm": McuTestCase(
+        model=CortexMBmm(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16, 16)),
+            ramp_tensor(0, 10, (1, 16, 16)),
+        ),
+    ),
+    "addmm": McuTestCase(
+        model=CortexMAddmm(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+            2,
+            4,
+        ),
+    ),
+    "addmm_scalars": McuTestCase(
+        model=CortexMAddmm(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+        ),
+    ),
+    "@-operator": McuTestCase(
+        model=CortexMAt(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+        ),
+    ),
+    "matmul": McuTestCase(
+        model=CortexMMatmul(),
+        example_inputs=(
+            ramp_tensor(0, 10, (1, 16)),
+            ramp_tensor(0, 10, (16, 16)),
+        ),
+    ),
+    "linear_rank1": McuTestCase(
+        model=CortexMLinear(2, 3),
+        example_inputs=(ramp_tensor(-1, 1, (2,)),),
+    ),
+    "linear_rank2_pos": McuTestCase(
+        model=CortexMLinear(8, 3),
+        example_inputs=(ramp_tensor(0, 10, (2, 8)),),
+    ),
+    "linear_rank3_neg": McuTestCase(
+        model=CortexMLinear(5, 3),
+        example_inputs=(ramp_tensor(-40, 0, (4, 2, 5)),),
+    ),
+    "linear_rank4": McuTestCase(
+        model=CortexMLinear(16, 32),
+        example_inputs=(ramp_tensor(-100, 100, (2, 1, 2, 16)),),
+    ),
+    "linear_rank5": McuTestCase(
+        model=CortexMLinear(4, 3),
+        example_inputs=(ramp_tensor(-2, 2, (5, 2, 1, 2, 4)),),
+    ),
+    "linear_bias": McuTestCase(
+        model=CortexMLinearBias(61, 37),
+        example_inputs=(ramp_tensor(0, 10, (8, 61)),),
+    ),
+}
+
+dialect_xfails = {
+    "mm": ("torch.mm ops are currently not quantized", RuntimeError),
+    "bmm": ("torch.bmm ops are currently not quantized", RuntimeError),
+    "addmm": ("torch.addmm ops are currently not quantized", RuntimeError),
+    "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError),
+    "matmul": ("torch.matmul ops are currently not quantized", RuntimeError),
+    "@-operator": ("@ ops are currently not quantized", RuntimeError),
+    "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank2_pos": ("name 'int32' is not defined", NameError),
+    "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank5": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_bias": ("name 'int32' is not defined", NameError),
+}
+
+
+@parametrize("test_case", test_cases, dialect_xfails)
+def test_dialect_linear(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms, test_case.model.ops_after_transforms
+    )
+
+
+implementation_xfails = {
+    "mm": ("torch.mm ops are currently not quantized", RuntimeError),
+    "bmm": ("torch.bmm ops are currently not quantized", RuntimeError),
+    "addmm": ("torch.addmm ops are currently not quantized", RuntimeError),
+    "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError),
+    "matmul": ("torch.matmul ops are currently not quantized", RuntimeError),
+    "@-operator": ("@ ops are currently not quantized", RuntimeError),
+    "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank2_pos": ("Output 0 does not match reference output.", AssertionError),
+    "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_rank5": ("Only rank 2 linear ops are fused currently", RuntimeError),
+    "linear_bias": ("Output 0 does not match reference output.", AssertionError),
+}
+
+
+@parametrize("test_case", test_cases, implementation_xfails)
+def test_implementation_linear(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_implementation()
diff --git a/backends/cortex_m/test/ops/test_mul.py b/backends/cortex_m/test/ops/test_mul.py
new file mode 100644
index 00000000000..a2f13760bf0
--- /dev/null
+++ b/backends/cortex_m/test/ops/test_mul.py
@@ -0,0 +1,131 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import pytest
+import torch
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+from executorch.backends.test.suite.operators.test_mul import Model
+
+
+class CortexMSelfMul(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def forward(self, x):
+        return x * x
+
+
+class CortexMScalarMul(Model):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+
+class CortexMTensorMul(Model):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+
+test_cases = {
+    "self_scalar": McuTestCase(
+        CortexMSelfMul(),
+        (10.0,),
+    ),
+    "self_rank_1": McuTestCase(
+        CortexMSelfMul(),
+        (ramp_tensor(-5, 5, (10,)),),
+    ),
+    "self_rank_2_pos": McuTestCase(
+        CortexMSelfMul(),
+        (ramp_tensor(0, 1000, (10, 1)),),
+    ),
+    "self_rank_3_neg": McuTestCase(
+        CortexMSelfMul(),
+        (ramp_tensor(-100, 0, (2, 2, 2)),),
+    ),
+    "self_rank_4_small": McuTestCase(
+        CortexMSelfMul(),
+        (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),),
+    ),
+    "self_rank_5": McuTestCase(
+        CortexMSelfMul(),
+        (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),),
+    ),
+    "scalar_scalar": McuTestCase(
+        CortexMScalarMul(),
+        (-0.5, 1.0),
+    ),
+    "tensor_scalar": McuTestCase(
+        CortexMScalarMul(),
+        (torch.ones(2, 2), 1.0),
+    ),
+    "scalar_tensor": McuTestCase(
+        CortexMScalarMul(),
+        (1000.0, torch.ones(2, 2)),
+    ),
+    "broadcast_1": McuTestCase(
+        CortexMTensorMul(),
+        (torch.ones(1), torch.ones(2, 2, 2, 2)),
+    ),
+    "broadcast_2": McuTestCase(
+        CortexMTensorMul(),
+        (torch.ones((2, 1, 1, 1)), torch.ones(1)),
+    ),
+    "broadcast_3": McuTestCase(
+        CortexMTensorMul(),
+        (
+            ramp_tensor(-2, 2, (2, 1, 2, 1)),
+            ramp_tensor(-5, 5, (1, 2, 1, 2)),
+        ),
+    ),
+}
+
+
+@pytest.mark.skip(reason="Not implemented yet")
+@parametrize("test_case", test_cases)
+def test_dialect_mul(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms, test_case.model.ops_after_transforms
+    )
+
+
+@pytest.mark.skip(reason="Not implemented yet")
+@parametrize("test_case", test_cases)
+def test_implementation_mul(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_implementation()
diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py
index 8af31e58cd7..c492d3c8443 100644
--- a/backends/cortex_m/test/tester.py
+++ b/backends/cortex_m/test/tester.py
@@ -8,13 +8,11 @@
 from typing import Any
 
 import torch
-
-from backends.xnnpack.quantizer.xnnpack_quantizer import (
-    get_symmetric_quantization_config,
-    XNNPACKQuantizer,
-)
 from executorch.backends.arm.test.common import get_u55_compile_spec
 from executorch.backends.arm.test.tester.arm_tester import Serialize
+from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import (
+    QuantizedLinearFusionPass,
+)
 from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import (
     QuantizedOpFusionPass,
 )
@@ -33,6 +31,11 @@
 )
 from executorch.backends.xnnpack._passes import XNNPACKPassManager
 
+from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
+
 
 class CortexMQuantize(Quantize):
     def __init__(self):
@@ -44,7 +47,12 @@ def __init__(self):
 class CortexMRunPasses(RunPasses):
     def __init__(self):
         super().__init__(
-            XNNPACKPassManager, pass_list=[QuantizedOpFusionPass, ReplaceQuantNodesPass]
+            XNNPACKPassManager,
+            pass_list=[
+                ReplaceQuantNodesPass,
+                QuantizedLinearFusionPass,
+                QuantizedOpFusionPass,
+            ],
         )
 
 
@@ -98,3 +106,9 @@ def test_implementation(self, qtol=0):
 class McuTestCase:
     model: torch.nn.Module
     example_inputs: tuple[Any]
+
+
+def ramp_tensor(start: int, end: int, shape: tuple[int]) -> torch.Tensor:
+    return torch.linspace(start, end, steps=torch.prod(torch.tensor(shape))).reshape(
+        shape
+    )
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
new file mode 100644
index 00000000000..221291442ec
--- /dev/null
+++ b/backends/cuda/CMakeLists.txt
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Build AOTI CUDA backend for runtime.
+#
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+cmake_minimum_required(VERSION 3.29)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+find_package_torch()
+
+# CUDA-specific AOTI functionality
+set(_aoti_cuda_sources
+    runtime/cuda_backend.cpp runtime/shims/memory.cpp
+    runtime/shims/tensor_attribute.cpp runtime/guard.cpp
+    runtime/shims/cuda_guard.cpp
+)
+add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
+target_include_directories(
+  aoti_cuda
+  PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         # PyTorch AOTI headers from ExecutorTorch's torch detection
+         ${TORCH_INCLUDE_DIRS}
+)
+target_compile_options(
+  aoti_cuda PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+                   $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+target_link_options(
+  aoti_cuda PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
+)
+
+# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+target_link_libraries(
+  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+)
+# If you need other CUDA libraries, link them similarly:
+# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
+executorch_target_link_options_shared_lib(aoti_cuda)
+
+if(BUILD_TESTING)
+  # Add runtime
+  add_executable(voxtral_runner tests/voxtral_runner.cpp)
+  target_link_libraries(
+    voxtral_runner PUBLIC aoti_cuda extension_module_static
+                          extension_flat_tensor portable_ops_lib
+  )
+endif()
+
+install(
+  TARGETS aoti_cuda
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS
index 3e412b6dc56..fe57f7f1b63 100644
--- a/backends/cuda/TARGETS
+++ b/backends/cuda/TARGETS
@@ -6,6 +6,7 @@ runtime.python_library(
     name = "cuda_backend",
     srcs = [
         "cuda_backend.py",
+        "replace_slice_copy_with_slice.py",
     ],
     visibility = [
         "//executorch/...",
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 49314bed5e6..ef98de29f23 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -7,10 +7,14 @@
 import contextlib
 import os
 import typing
+from enum import Enum
 
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
+from executorch.backends.cuda.replace_slice_copy_with_slice import (
+    ReplaceSliceCopyWithSlicePass,
+)
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import (
@@ -21,7 +25,7 @@
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
 from torch.export.passes import move_to_device_pass
-
+from torch.nn.attention import SDPBackend
 
 # exist fallback operators in et namespace;
 supported_fallback_kernels: Dict[str, Any] = {}
@@ -30,6 +34,10 @@
 missing_fallback_kernels: Set[str] = set()
 
 
+class COMPILE_SPEC_KEYS(Enum):
+    METHOD_NAME = "method_name"
+
+
 # context manager for non-fallback guarantee
 # it will raise exception when generating fallback kernels during aoti compile
 @contextlib.contextmanager
@@ -108,6 +116,9 @@ def preprocess(
         # Move the edge_program from CPU to CUDA for aoti compile
         cuda_edge_program = move_to_device_pass(edge_program, "cuda")
 
+        # replace slice_copy with slice
+        ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module)
+
         edge_program_module = cuda_edge_program.module()
 
         # Grab all input placeholders from the graph
@@ -118,6 +129,8 @@ def preprocess(
                 user_input_placeholders.append(node.meta["val"])
 
         options: dict[str, typing.Any] = {
+            # Better model precision
+            "emulate_precision_casts": True,
             # Embed CUDA kernel binaries directly into the compiled shared object
             "aot_inductor.embed_kernel_binary": True,
             # Do not link against the full PyTorch/libtorch library
@@ -132,7 +145,12 @@ def preprocess(
             "max_autotune_conv_backends": "TRITON",
         }
 
-        with collect_unsupported_fallback_kernels():
+        with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel(
+            [
+                SDPBackend.MATH  # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`.
+            ]
+        ), torch.no_grad():
+            # torch._logging.set_logs(post_grad_graphs=True)
             so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options)  # type: ignore[arg-type]
             if len(missing_fallback_kernels) > 0:
                 formatted_kernels = "\n  - ".join(sorted(missing_fallback_kernels))
@@ -146,7 +164,10 @@ def preprocess(
             so_data = f.read()
 
         named_data_store = NamedDataStore()
-        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+        method_name = CudaBackend.method_name_from_compile_specs(compile_specs)
+        named_data_store.add_named_data(
+            method_name + "_so_blob", so_data, 1, "aoti_cuda_blob"
+        )
 
         # Clean up the generated so file; it has been packaged into the NamdeDataStore
         # pyre-ignorep[6]: Incompatible parameter type
@@ -157,3 +178,30 @@ def preprocess(
             debug_handle_map={},
             data_store_output=named_data_store.get_named_data_store_output(),
         )
+
+    @staticmethod
+    def generate_method_name_compile_spec(
+        method_name: str,
+    ) -> CompileSpec:
+        """
+        Returns the compile spec representing the model compute precision, for additional details
+        please refer to the documentation for ``coremltools.precision``.
+        """
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.METHOD_NAME.value,
+            method_name.encode("utf-8"),
+        )
+
+    @staticmethod
+    def method_name_from_compile_specs(
+        compile_specs: List[CompileSpec],
+    ) -> str:
+        """
+        Returns the method name from the compile specs.
+        """
+        for spec in compile_specs:
+            if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value:
+                return spec.value.decode("utf-8")
+        raise RuntimeError(
+            f"Could not find method name in compile specs: {compile_specs}"
+        )
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index d52d7d3d087..64df7b7dcb2 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -15,7 +15,7 @@
     Partitioner,
     PartitionResult,
 )
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
 from torch.export.exported_program import ExportedProgram
 
 
@@ -44,14 +44,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         """
 
         partition_tags: Dict[str, DelegationSpec] = {}
+        tag = "tag0"
+
         for node in exported_program.graph.nodes:
             if node.op != "call_function":
                 continue
-            tag = "tag0"
             node.meta["delegation_tag"] = tag
-            partition_tags[tag] = self.delegation_spec
+
+        partition_tags[tag] = self.delegation_spec
 
         tag_constant_data(exported_program)
+        tag_mutated_buffer(exported_program)
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
new file mode 100644
index 00000000000..4f16759af35
--- /dev/null
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Dict, Iterable, Tuple
+
+import torch
+from executorch.exir.dialects._ops import ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch import fx
+
+
+_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = (
+    torch.ops.aten.slice_copy.Tensor,
+    ops.edge.aten.slice_copy.Tensor,
+)
+
+_SLICE_TARGETS: Dict[
+    torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload
+] = {
+    torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor,
+    ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor,
+}
+
+
+class ReplaceSliceCopyWithSlicePass(ExportPass):
+    """Replace non-mutated ``slice_copy`` results with ``slice`` views."""
+
+    def call(self, graph_module: fx.GraphModule) -> PassResult:
+        graph_changed = False
+
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS:
+                continue
+
+            if self._has_blocking_user(node, node.users.keys()):
+                continue
+
+            node.target = _SLICE_TARGETS[node.target]
+            graph_changed = True
+
+        if graph_changed:
+            graph_module.graph.lint()
+            graph_module.recompile()
+
+        return PassResult(graph_module, graph_changed)
+
+    def _has_blocking_user(self, node: fx.Node, users: Iterable[fx.Node]) -> bool:
+        for user in users:
+            if self._is_mutating_user(node, user) or self._is_view_user(node, user):
+                return True
+        return False
+
+    def _is_mutating_user(self, node: fx.Node, user: fx.Node) -> bool:
+        if user.op == "call_method":
+            # Treat in-place tensor methods conservatively as mutations only when the
+            # method name ends with ``_`` which is the PyTorch convention for mutation.
+            return isinstance(user.target, str) and user.target.endswith("_")
+
+        if user.op != "call_function":
+            return False
+
+        target = user.target
+        if not hasattr(target, "_schema"):
+            return False
+
+        schema = target._schema  # pyre-ignore[16]
+        # Positional arguments
+        for index, arg in enumerate(user.args):
+            if arg is node and self._argument_mutates(schema, index):
+                return True
+
+        # Keyword arguments
+        for name, arg in user.kwargs.items():
+            if arg is node and self._argument_mutates(schema, name):
+                return True
+
+        return False
+
+    def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool:
+        if user.op == "call_method":
+            # Treat tensor methods conservatively and assume they may be view-producing.
+            return True
+
+        if user.op != "call_function":
+            return False
+
+        target = user.target
+        if getattr(target, "is_view", False):
+            for arg in user.args:
+                if arg is node:
+                    return True
+            for arg in user.kwargs.values():
+                if arg is node:
+                    return True
+
+        return False
+
+    def _argument_mutates(
+        self, schema: torch._C.FunctionSchema, key: int | str
+    ) -> bool:
+        arguments = schema.arguments
+        if isinstance(key, int):
+            if key >= len(arguments):
+                return False
+            argument = arguments[key]
+        else:
+            argument = next((arg for arg in arguments if arg.name == key), None)
+            if argument is None:
+                return False
+
+        alias_info = argument.alias_info
+        return bool(alias_info and alias_info.is_write)
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index 1aa38760e5a..54412269287 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -5,13 +5,17 @@ oncall("executorch")
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
+        "guard.cpp",
+        "shims/cuda_guard.cpp",
         "shims/memory.cpp",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
+        "guard.h",
+        "shims/cuda_guard.h",
         "shims/memory.h",
         "shims/tensor_attribute.h",
-        "shims/utils.h",
+        "utils.h",
     ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
@@ -30,3 +34,25 @@ runtime.cxx_library(
         ("cuda", None, "cuda-lazy"),
     ],
 )
+
+runtime.cxx_library(
+    name = "cuda_backend",
+    srcs = [
+        "cuda_backend.cpp",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    # Constructor needed for backend registration.
+    compiler_flags = ["-Wno-global-constructors"],
+    visibility = ["@EXECUTORCH_CLIENTS"],
+    deps = [
+        ":runtime_shims",
+        "//executorch/backends/aoti:aoti_common",
+        "//executorch/runtime/backend:interface",
+        "//executorch/runtime/core/exec_aten/util:tensor_util",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+)
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
new file mode 100644
index 00000000000..805c54ff55c
--- /dev/null
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <dlfcn.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <unistd.h>
+#include <cstdio>
+
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <vector>
+
+// Include our shim layer headers
+#include <executorch/backends/aoti/aoti_model_container.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/shims/memory.h>
+#include <executorch/backends/cuda/runtime/utils.h>
+
+namespace executorch::backends::cuda {
+
+#define LOAD_SYMBOL(name, handle)                                \
+  do {                                                           \
+    name = reinterpret_cast<name##Func>(dlsym(handle, #name));   \
+    ET_CHECK_OR_RETURN_ERROR(                                    \
+        name != nullptr, AccessFailed, "Failed to load " #name); \
+  } while (0)
+
+using namespace std;
+using namespace aoti;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::Tensor;
+
+class ET_EXPERIMENTAL CudaBackend final
+    : public ::executorch::runtime::BackendInterface {
+ private:
+  Error register_shared_library_functions(void* so_handle) const {
+    LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);
+
+    return Error::Ok;
+  }
+
+ public:
+  bool is_available() const override {
+    return 1;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be a empty buffer
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+  ) const override {
+    std::string method_name;
+    for (const CompileSpec& spec : compile_specs) {
+      if (std::strcmp(spec.key, "method_name") == 0) {
+        method_name.assign(
+            static_cast<const char*>(spec.value.buffer),
+            spec.value.nbytes); // no nullptr guarantee, so pass size
+        break;
+      }
+    }
+
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
+
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+    auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
+    ET_CHECK_OR_RETURN_ERROR(
+        aoti_cuda_buffer.ok(),
+        Internal,
+        "Failed to get data for key %s: 0x%x",
+        so_blob_key.c_str(),
+        static_cast<uint32_t>(aoti_cuda_buffer.error()));
+
+    // Generate dynamic temporary file path
+    filesystem::path temp_dir = filesystem::temp_directory_path();
+    filesystem::path so_path =
+        temp_dir / (so_blob_key + to_string(getpid()) + ".so");
+
+    // Create a temporary file
+    ofstream outfile(so_path.c_str(), ios::binary);
+
+    // Write the ELF buffer to the temporary file
+    ET_LOG(
+        Info,
+        "Writing %zu bytes to %s",
+        aoti_cuda_buffer->size(),
+        so_path.c_str());
+
+    outfile.write(
+        static_cast<const char*>(aoti_cuda_buffer->data()),
+        aoti_cuda_buffer->size());
+
+    ET_CHECK_OR_RETURN_ERROR(
+        outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
+
+    // Finish writing the file to disk
+    outfile.close();
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    ET_CHECK_OR_RETURN_ERROR(
+        so_handle != nullptr,
+        AccessFailed,
+        "Failed to load shared library: %s",
+        dlerror());
+
+    processed->Free();
+
+    // Register all shared library functions
+    ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr));
+
+    ET_LOG(Info, "container_handle = %p", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->so_path = so_path.string();
+    handle->container_handle = container_handle;
+
+    // Create a CUDA stream for asynchronous execution
+    cudaStream_t cuda_stream;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
+    handle->cuda_stream = static_cast<void*>(cuda_stream);
+
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    // Need to re-register all the symbols from the so_handle hosted by this
+    // CudaBackend instance. The reason is that these symbols are
+    // static/singleton across the whole process. When we share multiple methods
+    // (meaning multiple so_handle) in the same process, we need to re-register
+    // the symbols from the so_handle that is being used in this execution.
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        register_shared_library_functions(handle->so_handle));
+
+    size_t n_inputs;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    ET_CHECK_OR_RETURN_ERROR(
+        n_inputs + n_outputs == args.size(),
+        InvalidArgument,
+        "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+        n_inputs,
+        n_outputs,
+        args.size())
+
+    // NOTE: ExecuTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for CUDA kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
+
+    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
+    // copies
+    for (int i = 0; i < n_inputs; i++) {
+      // Get tensor dimensions and properties from ExecuTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      ET_CHECK_OR_RETURN_ERROR(
+          create_err == Error::Ok,
+          Internal,
+          "Failed to create GPU tensor for input %d",
+          i);
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Copy data from CPU to GPU
+      ET_CHECK_OR_RETURN_ERROR(
+          aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
+          Internal,
+          "Failed to copy input %d from CPU to GPU",
+          i);
+    }
+    // Process output tensors: create GPU counterparts for ExecuTorch CPU
+    // tensors
+    for (int i = 0; i < n_outputs; i++) {
+      // Get output tensor dimensions from ExecuTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      ET_CHECK_OR_RETURN_ERROR(
+          create_err == Error::Ok,
+          Internal,
+          "Failed to create GPU tensor for output %d",
+          i);
+
+      gpu_outputs[i] = gpu_output_handle;
+    }
+    // Run AOTI container with GPU tensors
+    AOTIRuntimeError error = AOTInductorModelContainerRun(
+        handle->container_handle,
+        gpu_inputs.data(), // Use GPU input tensors
+        n_inputs,
+        gpu_outputs.data(), // Use GPU output tensors
+        n_outputs,
+        handle->cuda_stream, // Pass the actual CUDA stream
+        nullptr); // proxy_executor_handle can remain nullptr
+
+    ET_CHECK_OR_RETURN_ERROR(
+        error == Error::Ok,
+        Internal,
+        "AOTInductorModelContainerRun failed with error code %d",
+        error);
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      // For DYNAMIC_BOUND tensors we try to resize
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+          "Error resizing tensor at output index %d",
+          i);
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+          "Failed to copy GPU output %d back to CPU",
+          i);
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    if (handle_ == nullptr) {
+      return;
+    }
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    // Destroy the CUDA stream if it exists
+    if (handle->cuda_stream != nullptr) {
+      cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
+      cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
+      ET_CHECK_OR_LOG_ERROR(
+          stream_err == cudaSuccess,
+          "Failed to destroy CUDA stream: %s",
+          cudaGetErrorString(stream_err));
+      handle->cuda_stream = nullptr;
+    }
+
+    // NOTE: AOTInductorModelContainerDelete does not work correctly with
+    // multiple .so files. Deleting one container frees shared resources,
+    // which causes segmentation faults when attempting to delete other
+    // containers. As a workaround, we skip explicit container deletion
+    // and defer cleanup to the OS.
+    // TODO(gasoonjia): Find a proper solution for safe container deletion.
+    // AOTInductorModelContainerDelete(handle->container_handle);
+
+    // Now close the shared library
+    if (handle->so_handle != nullptr) {
+      dlclose(handle->so_handle);
+    }
+
+    // Remove the temporary shared library file
+    if (!handle->so_path.empty()) {
+      std::error_code remove_error;
+      std::filesystem::remove(handle->so_path, remove_error);
+      ET_CHECK_OR_LOG_ERROR(
+          !remove_error,
+          "Failed to remove temporary shared library %s: %s",
+          handle->so_path.c_str(),
+          remove_error.message().c_str());
+    }
+
+    delete handle;
+    clear_all_tensors();
+  }
+};
+
+} // namespace executorch::backends::cuda
+
+namespace executorch::backends {
+namespace {
+auto cls = cuda::CudaBackend();
+executorch::runtime::Backend backend{"CudaBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+} // namespace executorch::backends
diff --git a/backends/cuda/runtime/guard.cpp b/backends/cuda/runtime/guard.cpp
new file mode 100644
index 00000000000..674cc6387b3
--- /dev/null
+++ b/backends/cuda/runtime/guard.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+
+namespace {
+// Thread-local stream storage (private to this file)
+thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
+} // namespace
+
+Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
+  if (device_index == -1) {
+    // Get current device if not specified
+    int current_device;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&current_device));
+    device_index = current_device;
+  }
+
+  current_streams_[device_index] = stream;
+  return Error::Ok;
+}
+
+Result<cudaStream_t> getCurrentCUDAStream(DeviceIndex device_index) {
+  if (device_index == -1) {
+    int current_device;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&current_device));
+    device_index = current_device;
+  }
+
+  auto it = current_streams_.find(device_index);
+  if (it != current_streams_.end()) {
+    return it->second;
+  }
+
+  cudaStream_t stream;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream));
+  setCurrentCUDAStream(stream, device_index);
+  return stream;
+}
+
+CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
+    : original_device_index_(other.original_device_index_),
+      current_device_index_(other.current_device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the device
+  other.original_device_index_ = other.current_device_index_;
+}
+
+CUDAGuard::~CUDAGuard() {
+  if (original_device_index_ != current_device_index_) {
+    cudaError_t err = cudaSetDevice(original_device_index_);
+    if (err != cudaSuccess) {
+      ET_LOG(
+          Error,
+          "~CUDAGuard: Failed to restore device to %d: %s",
+          original_device_index_,
+          cudaGetErrorString(err));
+    }
+  }
+}
+
+Error CUDAGuard::set_index(DeviceIndex device_index) {
+  int orig_index = -1;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&orig_index));
+
+  original_device_index_ = orig_index;
+  current_device_index_ = device_index;
+
+  if (current_device_index_ != original_device_index_) {
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_));
+  }
+
+  return Error::Ok;
+}
+
+Result<CUDAGuard> CUDAGuard::create(DeviceIndex device_index) {
+  CUDAGuard guard; // Fixed: Removed () to create a variable, not a function
+  ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index));
+  return guard;
+}
+
+CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept
+    : device_guard_(std::move(other.device_guard_)),
+      original_stream_(other.original_stream_),
+      current_stream_(other.current_stream_),
+      device_index_(other.device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the stream
+  other.original_stream_ = other.current_stream_;
+}
+
+CUDAStreamGuard::~CUDAStreamGuard() {
+  // Restore the original stream unless this object was moved-from.
+  // After a move, original_stream_ == current_stream_, which indicates
+  // the moved-from object should not restore.
+  // Note: nullptr is a valid stream value (represents the default stream),
+  // so we must restore even if original_stream_ is nullptr.
+  if (original_stream_ != current_stream_) {
+    Error err = setCurrentCUDAStream(original_stream_, device_index_);
+    if (err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "~CUDAStreamGuard: Failed to restore stream for device %d",
+          device_index_);
+    }
+  }
+}
+
+Error CUDAStreamGuard::set_stream(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto result = getCurrentCUDAStream(device_index);
+  if (!result.ok()) {
+    ET_LOG(Error, "Failed to get current stream for device %d", device_index);
+    return result.error();
+  }
+
+  original_stream_ = result.get();
+  current_stream_ = stream;
+  device_index_ = device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index));
+
+  return Error::Ok;
+}
+
+Result<CUDAStreamGuard> CUDAStreamGuard::create(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto guard_result = CUDAGuard::create(device_index);
+  ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error());
+
+  CUDAStreamGuard stream_guard(std::move(guard_result.get()));
+  ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index));
+
+  return stream_guard;
+}
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h
new file mode 100644
index 00000000000..3f187000f90
--- /dev/null
+++ b/backends/cuda/runtime/guard.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <executorch/backends/cuda/runtime/utils.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <cstdint>
+
+namespace executorch::backends::cuda {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+
+// Type alias for device index
+using DeviceIndex = int32_t;
+
+/**
+ * Set the current CUDA stream for the specified device.
+ *
+ * @param stream The CUDA stream to set as current
+ * @param device_index The device index (-1 to use current device)
+ * @return Error code indicating success or failure
+ */
+Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index = -1);
+
+/**
+ * Get the current CUDA stream for the specified device.
+ * If no stream has been set, creates a new stream and sets it as current.
+ *
+ * @param device_index The device index (-1 to use current device)
+ * @return Result containing the current stream on success, or an error code on
+ * failure
+ */
+Result<cudaStream_t> getCurrentCUDAStream(DeviceIndex device_index = -1);
+
+/**
+ * RAII guard that sets the current CUDA device and restores it on destruction.
+ * This ensures that the device is properly restored even if an exception
+ * occurs.
+ *
+ */
+class CUDAGuard {
+ private:
+  /**
+   * Private constructor - use create() factory method instead.
+   */
+  explicit CUDAGuard()
+      : original_device_index_(-1), current_device_index_(-1) {}
+
+ public:
+  /**
+   * Factory method to create a CUDAGuard.
+   *
+   * @param device_index The device index to set as current
+   * @return Result containing the guard on success, or an error code on failure
+   */
+  static Result<CUDAGuard> create(DeviceIndex device_index);
+
+  // Copy is not allowed
+  CUDAGuard(const CUDAGuard&) = delete;
+  CUDAGuard& operator=(const CUDAGuard&) = delete;
+
+  // Move constructor and assignment
+  CUDAGuard(CUDAGuard&& other) noexcept;
+  CUDAGuard& operator=(CUDAGuard&& other) = delete;
+
+  /**
+   * Destructor that restores the original device if necessary.
+   */
+  ~CUDAGuard();
+
+  /**
+   * Sets the CUDA device to the given device index.
+   *
+   * @param device_index The device index to set as current
+   * @return Error code indicating success or failure
+   */
+  Error set_index(DeviceIndex device_index);
+
+  /**
+   * Get the original device index before the guard was created.
+   *
+   * @return The original device index
+   */
+  DeviceIndex original_device() const {
+    return original_device_index_;
+  }
+
+  /**
+   * Get the current device index.
+   *
+   * @return The current device index
+   */
+  DeviceIndex current_device() const {
+    return current_device_index_;
+  }
+
+ private:
+  /// The original device before this guard was created
+  DeviceIndex original_device_index_;
+  /// The current device managed by this guard
+  DeviceIndex current_device_index_;
+};
+
+/**
+ * RAII guard that sets the current CUDA device and stream, restoring both on
+ * destruction. This is useful for temporarily switching to a different device
+ * and stream.
+ *
+ */
+class CUDAStreamGuard {
+ private:
+  // Private constructor that takes a CUDAGuard
+  explicit CUDAStreamGuard(CUDAGuard&& guard)
+      : device_guard_(std::move(guard)),
+        original_stream_(nullptr),
+        current_stream_(nullptr),
+        device_index_(-1) {}
+
+ public:
+  /**
+   * Factory method to create a CUDAStreamGuard.
+   *
+   * @param stream The CUDA stream to set as current
+   * @param device_index The device index for the stream
+   * @return Result containing the guard on success, or an error code on failure
+   */
+  static Result<CUDAStreamGuard> create(
+      cudaStream_t stream,
+      DeviceIndex device_index);
+
+  // Copy is not allowed
+  CUDAStreamGuard(const CUDAStreamGuard&) = delete;
+  CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete;
+
+  // Move constructor and assignment
+  CUDAStreamGuard(CUDAStreamGuard&& other) noexcept;
+  CUDAStreamGuard& operator=(CUDAStreamGuard&& other) noexcept = delete;
+
+  /**
+   * Destructor that restores the original stream and device.
+   */
+  ~CUDAStreamGuard();
+
+  /**
+   * Sets the CUDA stream to the given stream on the specified device.
+   *
+   * @param stream The CUDA stream to set as current
+   * @param device_index The device index for the stream
+   * @return Error code indicating success or failure
+   */
+  Error set_stream(cudaStream_t stream, DeviceIndex device_index);
+
+  /**
+   * Get the current guarded stream.
+   *
+   * @return The current stream
+   */
+  cudaStream_t stream() const {
+    return current_stream_;
+  }
+
+  /**
+   * Get the device index being guarded.
+   *
+   * @return The device index
+   */
+  DeviceIndex device_index() const {
+    return device_index_;
+  }
+
+ private:
+  /// The device guard that handles device switching
+  CUDAGuard device_guard_;
+  /// The original stream that was current before this guard
+  cudaStream_t original_stream_ = nullptr;
+  /// The current stream being guarded
+  cudaStream_t current_stream_ = nullptr;
+  /// The device index for this stream guard
+  DeviceIndex device_index_;
+};
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/cuda_guard.cpp b/backends/cuda/runtime/shims/cuda_guard.cpp
new file mode 100644
index 00000000000..bb07acc7ffa
--- /dev/null
+++ b/backends/cuda/runtime/shims/cuda_guard.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/shims/cuda_guard.h>
+
+namespace executorch::backends::cuda {
+
+extern "C" {
+
+AOTITorchError aoti_torch_create_cuda_guard(
+    int32_t device_index,
+    CUDAGuardHandle* ret_guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_guard failed: ret_guard is null");
+
+  auto result = CUDAGuard::create(device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_guard = new CUDAGuard(std::move(result.get()));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_delete_cuda_guard failed: guard is null");
+
+  delete guard;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_cuda_guard_set_index(
+    CUDAGuardHandle guard,
+    int32_t device_index) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_guard_set_index failed: guard is null");
+
+  ET_CHECK_OK_OR_RETURN_ERROR(guard->set_index(device_index));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_stream_guard failed: ret_guard is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      stream != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_stream_guard failed: stream is null");
+
+  auto result =
+      CUDAStreamGuard::create(static_cast<cudaStream_t>(stream), device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_guard = new CUDAStreamGuard(std::move(result.get()));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_delete_cuda_stream_guard failed: guard is null");
+
+  delete guard;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_current_cuda_stream(
+    int32_t device_index,
+    void** ret_stream) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_stream != nullptr,
+      InvalidArgument,
+      "aoti_torch_get_current_cuda_stream failed: ret_stream is null");
+
+  auto result = getCurrentCUDAStream(device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_stream = static_cast<void*>(result.get());
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h
new file mode 100644
index 00000000000..f930f3df643
--- /dev/null
+++ b/backends/cuda/runtime/shims/cuda_guard.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <cstdint>
+
+namespace executorch::backends::cuda {
+
+using executorch::backends::aoti::AOTITorchError;
+
+extern "C" {
+
+// Handle types for CUDA guards
+using CUDAGuardHandle = CUDAGuard*;
+using CUDAStreamGuardHandle = CUDAStreamGuard*;
+
+/**
+ * Creates a CUDA device guard that sets the current device and restores it
+ * upon destruction.
+ *
+ * @param device_index The device index to set as current
+ * @param ret_guard Output parameter for the created guard handle (must not be
+ * null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_create_cuda_guard(
+    int32_t device_index,
+    CUDAGuardHandle* ret_guard);
+
+/**
+ * Deletes a CUDA device guard and frees its associated resources.
+ *
+ * @param guard Handle to the guard to be deleted
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+
+/**
+ * Sets the CUDA device to a new index for an existing guard.
+ *
+ * @param guard Handle to the guard
+ * @param device_index The device index to set as current
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_cuda_guard_set_index(
+    CUDAGuardHandle guard,
+    int32_t device_index);
+
+/**
+ * Creates a CUDA stream guard that sets the current device and stream,
+ * restoring both upon destruction.
+ *
+ * @param stream The CUDA stream to set as current
+ * @param device_index The device index for the stream
+ * @param ret_guard Output parameter for the created guard handle (must not be
+ * null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard);
+
+/**
+ * Deletes a CUDA stream guard and frees its associated resources.
+ *
+ * @param guard Handle to the stream guard to be deleted
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+
+/**
+ * Gets the current CUDA stream for a specified device.
+ *
+ * @param device_index The device index (-1 to use current device)
+ * @param ret_stream Output parameter for the current stream (must not be null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_get_current_cuda_stream(
+    int32_t device_index,
+    void** ret_stream);
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 2b32d820301..6fe315ba8ee 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/utils.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
 #include <cstdlib> // For posix_memalign
@@ -19,9 +19,7 @@
 #include <unordered_set>
 #include <vector>
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::aten::SizesType;
 using executorch::aten::StridesType;
@@ -227,7 +225,7 @@ AOTITorchError aoti_torch_empty_strided(
 
   if (device_type == static_cast<int32_t>(SupportedDevices::CUDA)) {
     ET_CUDA_CHECK_OR_RETURN_ERROR(
-        cudaMallocManaged(&ptr, static_cast<size_t>(nbytes)));
+        cudaMallocAsync(&ptr, static_cast<size_t>(nbytes), cudaStreamDefault));
   } else if (device_type == static_cast<int32_t>(SupportedDevices::CPU)) {
     // Ensure 16-byte alignment for CPU memory to match CUDA requirements
     int result = posix_memalign(&ptr, 16, nbytes);
@@ -271,14 +269,21 @@ void clear_all_tensors() {
   // Use aoti_torch_delete_tensor_object to properly delete each tensor
   // Note: We need to collect tensor pointers first since deletion modifies the
   // set
-  auto old_tensors =
-      std::move(tensors); // tensors is now empty and no need to copy
-  for (const auto& tensor_shared : old_tensors) {
-    aoti_torch_delete_tensor_object(tensor_shared.get());
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
+  }
+
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
   }
 
   // tensors set should now be empty, but ensure it's cleared
   tensors.clear();
+
+  ET_LOG(Info, "Cleared all tensors");
 }
 
 AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
@@ -323,11 +328,14 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
           ET_CUDA_CHECK_OR_RETURN_ERROR(
               cudaPointerGetAttributes(&attributes, data_ptr));
 
-          if (attributes.type == cudaMemoryTypeManaged) {
-            // This is CUDA managed memory - free with proper synchronization
-            ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize());
-            ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr));
+          if (attributes.type == cudaMemoryTypeDevice) {
+            ET_CUDA_CHECK_OR_RETURN_ERROR(
+                cudaFreeAsync(data_ptr, cudaStreamDefault));
           } else {
+            ET_CHECK_OR_RETURN_ERROR(
+                attributes.type != cudaMemoryTypeManaged,
+                Internal,
+                "Expected host memory but got managed!")
             // This is CPU memory - free immediately
             free(data_ptr);
             data_ptr = nullptr;
@@ -652,6 +660,4 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
index bcec6621285..7a8d4c3609b 100644
--- a/backends/cuda/runtime/shims/memory.h
+++ b/backends/cuda/runtime/shims/memory.h
@@ -12,9 +12,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <cstdint>
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::backends::aoti::AOTITorchError;
 using executorch::backends::aoti::Tensor;
@@ -145,6 +143,4 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
 void clear_all_tensors();
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp
index 5b640b7a9e8..1a14c79f9f2 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.cpp
+++ b/backends/cuda/runtime/shims/tensor_attribute.cpp
@@ -8,9 +8,7 @@
 
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 extern "C" {
 
@@ -31,6 +29,4 @@ int32_t aoti_torch_device_type_cuda() {
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
index e99958b4f0c..15a4e397d24 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.h
+++ b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -12,9 +12,7 @@
 #include <executorch/runtime/core/error.h>
 #include <cstdint>
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 // Common using declarations for ExecutorTorch types
 using executorch::runtime::Error;
@@ -35,6 +33,4 @@ int32_t aoti_torch_device_type_cuda();
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index fcb95a0beb7..70f27b86bec 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -32,3 +32,4 @@ def define_common_targets():
     cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2")
     cuda_shim_cpp_unittest("aoti_torch__reinterpret_tensor")
     cuda_shim_cpp_unittest("aoti_torch_copy_")
+    cuda_shim_cpp_unittest("aoti_torch_cuda_guard")
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
index ef00ecff656..e18bf142b5c 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
index 7579eaef039..9fca0f92cf8 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index 2cb12719782..d9b785a5a78 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp
new file mode 100644
index 00000000000..7527965cdb8
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/shims/cuda_guard.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/platform.h>
+#include <gtest/gtest.h>
+
+using namespace executorch::backends::aoti;
+using namespace executorch::backends::cuda;
+using namespace executorch::runtime;
+
+// TODO(gasoonjia): Multiple device tests were not included due to test
+// environment limitations. Will be added in the future.
+class AOTITorchCUDAGuardTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+
+    int device_count = 0;
+    cudaError_t err = cudaGetDeviceCount(&device_count);
+    if (err != cudaSuccess || device_count == 0) {
+      GTEST_SKIP() << "CUDA not available, skipping CUDA tests";
+    }
+
+    ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess);
+  }
+
+  void TearDown() override {
+    if (cudaGetDeviceCount(&original_device_) == cudaSuccess) {
+      ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess);
+    }
+  }
+
+  int original_device_ = 0;
+};
+
+TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAGuard) {
+  CUDAGuardHandle guard = nullptr;
+  AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(guard, nullptr);
+
+  int current_device = -1;
+  ASSERT_EQ(cudaGetDevice(&current_device), cudaSuccess);
+  EXPECT_EQ(current_device, 0);
+
+  error = aoti_torch_delete_cuda_guard(guard);
+  EXPECT_EQ(error, Error::Ok);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CreateCUDAGuardNullReturnPointer) {
+  AOTITorchError error = aoti_torch_create_cuda_guard(0, nullptr);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAGuardNullHandle) {
+  AOTITorchError error = aoti_torch_delete_cuda_guard(nullptr);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexNullHandle) {
+  AOTITorchError error = aoti_torch_cuda_guard_set_index(nullptr, 0);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexInvalidDevice) {
+  CUDAGuardHandle guard = nullptr;
+  AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard);
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(guard, nullptr);
+
+  error = aoti_torch_cuda_guard_set_index(guard, 999);
+  EXPECT_NE(error, Error::Ok);
+
+  error = aoti_torch_delete_cuda_guard(guard);
+  EXPECT_EQ(error, Error::Ok);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAStreamGuard) {
+  cudaStream_t stream;
+  ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess);
+
+  CUDAStreamGuardHandle guard = nullptr;
+  AOTITorchError error = aoti_torch_create_cuda_stream_guard(stream, 0, &guard);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(guard, nullptr);
+
+  error = aoti_torch_delete_cuda_stream_guard(guard);
+  EXPECT_EQ(error, Error::Ok);
+
+  ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullReturnPointer) {
+  cudaStream_t stream;
+  ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess);
+
+  AOTITorchError error =
+      aoti_torch_create_cuda_stream_guard(stream, 0, nullptr);
+  EXPECT_EQ(error, Error::InvalidArgument);
+
+  ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullStream) {
+  CUDAStreamGuardHandle guard = nullptr;
+  AOTITorchError error =
+      aoti_torch_create_cuda_stream_guard(nullptr, 0, &guard);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAStreamGuardNullHandle) {
+  AOTITorchError error = aoti_torch_delete_cuda_stream_guard(nullptr);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStream) {
+  void* ret_stream = nullptr;
+  AOTITorchError error = aoti_torch_get_current_cuda_stream(0, &ret_stream);
+
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_NE(ret_stream, nullptr);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStreamNullReturnPointer) {
+  AOTITorchError error = aoti_torch_get_current_cuda_stream(0, nullptr);
+  EXPECT_EQ(error, Error::InvalidArgument);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, StreamGuardWithSameDevice) {
+  ASSERT_EQ(cudaSetDevice(0), cudaSuccess);
+
+  cudaStream_t stream1, stream2;
+  ASSERT_EQ(cudaStreamCreate(&stream1), cudaSuccess);
+  ASSERT_EQ(cudaStreamCreate(&stream2), cudaSuccess);
+
+  CUDAStreamGuardHandle guard1 = nullptr;
+  AOTITorchError error =
+      aoti_torch_create_cuda_stream_guard(stream1, 0, &guard1);
+  EXPECT_EQ(error, Error::Ok);
+
+  void* ret_stream = nullptr;
+  error = aoti_torch_get_current_cuda_stream(0, &ret_stream);
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_EQ(static_cast<cudaStream_t>(ret_stream), stream1);
+
+  CUDAStreamGuardHandle guard2 = nullptr;
+  error = aoti_torch_create_cuda_stream_guard(stream2, 0, &guard2);
+  EXPECT_EQ(error, Error::Ok);
+
+  ret_stream = nullptr;
+  error = aoti_torch_get_current_cuda_stream(0, &ret_stream);
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_EQ(static_cast<cudaStream_t>(ret_stream), stream2);
+
+  error = aoti_torch_delete_cuda_stream_guard(guard2);
+  EXPECT_EQ(error, Error::Ok);
+
+  ret_stream = nullptr;
+  error = aoti_torch_get_current_cuda_stream(0, &ret_stream);
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_EQ(static_cast<cudaStream_t>(ret_stream), stream1);
+
+  error = aoti_torch_delete_cuda_stream_guard(guard1);
+  EXPECT_EQ(error, Error::Ok);
+
+  ASSERT_EQ(cudaStreamDestroy(stream1), cudaSuccess);
+  ASSERT_EQ(cudaStreamDestroy(stream2), cudaSuccess);
+}
+
+TEST_F(AOTITorchCUDAGuardTest, GetCurrentStreamAfterSetStream) {
+  cudaStream_t new_stream;
+  ASSERT_EQ(cudaStreamCreate(&new_stream), cudaSuccess);
+
+  CUDAStreamGuardHandle guard = nullptr;
+  AOTITorchError error =
+      aoti_torch_create_cuda_stream_guard(new_stream, 0, &guard);
+  EXPECT_EQ(error, Error::Ok);
+
+  void* ret_stream = nullptr;
+  error = aoti_torch_get_current_cuda_stream(0, &ret_stream);
+  EXPECT_EQ(error, Error::Ok);
+  EXPECT_EQ(static_cast<cudaStream_t>(ret_stream), new_stream);
+
+  error = aoti_torch_delete_cuda_stream_guard(guard);
+  EXPECT_EQ(error, Error::Ok);
+
+  ASSERT_EQ(cudaStreamDestroy(new_stream), cudaSuccess);
+}
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
index eceb141e9ca..10c8d8c1a31 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
index 8e6998f457c..da65129f18a 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/tests/TARGETS b/backends/cuda/runtime/tests/TARGETS
new file mode 100644
index 00000000000..9ff3e83a8bd
--- /dev/null
+++ b/backends/cuda/runtime/tests/TARGETS
@@ -0,0 +1,6 @@
+load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/backends/cuda/runtime/tests/targets.bzl b/backends/cuda/runtime/tests/targets.bzl
new file mode 100644
index 00000000000..37e8d876526
--- /dev/null
+++ b/backends/cuda/runtime/tests/targets.bzl
@@ -0,0 +1,27 @@
+load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+
+def cuda_runtime_cpp_unittest(name):
+    cpp_unittest(
+        name = "test_" + name,
+        srcs = [
+            "test_" + name + ".cpp",
+        ],
+        deps = [
+            "//executorch/backends/cuda/runtime:runtime_shims",
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/platform:platform",
+        ],
+        external_deps = [
+            ("cuda", None, "cuda-lazy"),
+        ],
+    )
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+    cuda_runtime_cpp_unittest("cuda_guard")
+    cuda_runtime_cpp_unittest("cuda_stream_guard")
diff --git a/backends/cuda/runtime/tests/test_cuda_guard.cpp b/backends/cuda/runtime/tests/test_cuda_guard.cpp
new file mode 100644
index 00000000000..a364ae98484
--- /dev/null
+++ b/backends/cuda/runtime/tests/test_cuda_guard.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/runtime/platform/platform.h>
+#include <gtest/gtest.h>
+
+using namespace executorch::backends::cuda;
+using namespace executorch::runtime;
+
+// TODO(gasoonjia): Multiple device tests were not included due to test
+// environment limitations. These tests should be added in the future when
+// multi-GPU test environments are available,
+
+class CUDAGuardTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+
+    int device_count = 0;
+    cudaError_t error = cudaGetDeviceCount(&device_count);
+    if (error != cudaSuccess || device_count == 0) {
+      GTEST_SKIP() << "CUDA not available or no CUDA devices found";
+    }
+    device_count_ = device_count;
+
+    ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess);
+  }
+
+  void TearDown() override {
+    if (device_count_ > 0) {
+      ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess);
+    }
+  }
+
+  int device_count_ = 0;
+  int original_device_ = 0;
+};
+
+TEST_F(CUDAGuardTest, BasicDeviceSwitching) {
+  int current_device;
+  ASSERT_EQ(cudaGetDevice(&current_device), cudaSuccess);
+
+  {
+    auto guard_result = CUDAGuard::create(0);
+    ASSERT_TRUE(guard_result.ok());
+    CUDAGuard guard = std::move(guard_result.get());
+
+    int device_after_guard;
+    ASSERT_EQ(cudaGetDevice(&device_after_guard), cudaSuccess);
+    EXPECT_EQ(device_after_guard, 0);
+    EXPECT_EQ(guard.current_device(), 0);
+    EXPECT_EQ(guard.original_device(), current_device);
+  }
+
+  int device_after_destruction;
+  ASSERT_EQ(cudaGetDevice(&device_after_destruction), cudaSuccess);
+  EXPECT_EQ(device_after_destruction, current_device);
+}
+
+TEST_F(CUDAGuardTest, SameDeviceNoSwitching) {
+  ASSERT_EQ(cudaSetDevice(0), cudaSuccess);
+
+  {
+    auto guard_result = CUDAGuard::create(0);
+    ASSERT_TRUE(guard_result.ok());
+    CUDAGuard guard = std::move(guard_result.get());
+
+    int current_device;
+    ASSERT_EQ(cudaGetDevice(&current_device), cudaSuccess);
+    EXPECT_EQ(current_device, 0);
+    EXPECT_EQ(guard.current_device(), 0);
+    EXPECT_EQ(guard.original_device(), 0);
+  }
+
+  int final_device;
+  ASSERT_EQ(cudaGetDevice(&final_device), cudaSuccess);
+  EXPECT_EQ(final_device, 0);
+}
+
+TEST_F(CUDAGuardTest, InvalidDeviceIndex) {
+  auto guard_result = CUDAGuard::create(999);
+  EXPECT_FALSE(guard_result.ok());
+}
+
+TEST_F(CUDAGuardTest, NegativeDeviceIndex) {
+  auto guard_result = CUDAGuard::create(-2);
+  EXPECT_FALSE(guard_result.ok());
+}
+
+TEST_F(CUDAGuardTest, CopyConstructorDeleted) {
+  static_assert(
+      !std::is_copy_constructible_v<CUDAGuard>,
+      "CUDAGuard should not be copy constructible");
+}
+
+TEST_F(CUDAGuardTest, CopyAssignmentDeleted) {
+  static_assert(
+      !std::is_copy_assignable_v<CUDAGuard>,
+      "CUDAGuard should not be copy assignable");
+}
+
+TEST_F(CUDAGuardTest, MoveAssignmentDeleted) {
+  static_assert(
+      !std::is_move_assignable_v<CUDAGuard>,
+      "CUDAGuard should not be move assignable");
+}
diff --git a/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp
new file mode 100644
index 00000000000..68a050a69be
--- /dev/null
+++ b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/runtime/platform/platform.h>
+#include <gtest/gtest.h>
+
+using namespace executorch::backends::cuda;
+using namespace executorch::runtime;
+
+// TODO(gasoonjia): Multiple device tests were not included due to test
+// environment limitations. These tests should be added in the future when
+// multi-GPU test environments are available,
+
+class CUDAStreamGuardTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    et_pal_init();
+
+    int device_count = 0;
+    cudaError_t error = cudaGetDeviceCount(&device_count);
+    if (error != cudaSuccess || device_count == 0) {
+      GTEST_SKIP() << "CUDA not available or no CUDA devices found";
+    }
+    device_count_ = device_count;
+
+    ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess);
+
+    ASSERT_EQ(cudaStreamCreate(&test_stream1_), cudaSuccess);
+    ASSERT_EQ(cudaStreamCreate(&test_stream2_), cudaSuccess);
+  }
+
+  void TearDown() override {
+    if (test_stream1_) {
+      ASSERT_EQ(cudaStreamDestroy(test_stream1_), cudaSuccess);
+    }
+    if (test_stream2_) {
+      ASSERT_EQ(cudaStreamDestroy(test_stream2_), cudaSuccess);
+    }
+
+    if (device_count_ > 0) {
+      ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess);
+    }
+  }
+
+  int device_count_ = 0;
+  int original_device_ = 0;
+  cudaStream_t test_stream1_ = nullptr;
+  cudaStream_t test_stream2_ = nullptr;
+};
+
+TEST_F(CUDAStreamGuardTest, BasicStreamSwitching) {
+  auto guard_result = CUDAStreamGuard::create(test_stream1_, 0);
+  ASSERT_TRUE(guard_result.ok());
+  CUDAStreamGuard guard = std::move(guard_result.get());
+
+  EXPECT_EQ(guard.stream(), test_stream1_);
+  EXPECT_EQ(guard.device_index(), 0);
+
+  auto current_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(current_stream_result.ok());
+  EXPECT_EQ(current_stream_result.get(), test_stream1_);
+
+  int current_device;
+  ASSERT_EQ(cudaGetDevice(&current_device), cudaSuccess);
+  EXPECT_EQ(current_device, 0);
+}
+
+TEST_F(CUDAStreamGuardTest, StreamSwitchingOnSameDevice) {
+  Error err = setCurrentCUDAStream(test_stream1_, 0);
+  ASSERT_EQ(err, Error::Ok);
+
+  auto current_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(current_stream_result.ok());
+  EXPECT_EQ(current_stream_result.get(), test_stream1_);
+
+  {
+    auto guard_result = CUDAStreamGuard::create(test_stream2_, 0);
+    ASSERT_TRUE(guard_result.ok());
+    CUDAStreamGuard guard = std::move(guard_result.get());
+
+    auto new_stream_result = getCurrentCUDAStream(0);
+    ASSERT_TRUE(new_stream_result.ok());
+    EXPECT_EQ(new_stream_result.get(), test_stream2_);
+    EXPECT_EQ(guard.stream(), test_stream2_);
+  }
+
+  auto restored_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(restored_stream_result.ok());
+  EXPECT_EQ(restored_stream_result.get(), test_stream1_);
+}
+
+TEST_F(CUDAStreamGuardTest, NestedStreamGuards) {
+  cudaStream_t initial_stream;
+  ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess);
+
+  Error err = setCurrentCUDAStream(initial_stream, 0);
+  ASSERT_EQ(err, Error::Ok);
+
+  {
+    auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0);
+    ASSERT_TRUE(guard1_result.ok());
+    CUDAStreamGuard guard1 = std::move(guard1_result.get());
+
+    auto stream_result = getCurrentCUDAStream(0);
+    ASSERT_TRUE(stream_result.ok());
+    EXPECT_EQ(stream_result.get(), test_stream1_);
+
+    {
+      auto guard2_result = CUDAStreamGuard::create(test_stream2_, 0);
+      ASSERT_TRUE(guard2_result.ok());
+      CUDAStreamGuard guard2 = std::move(guard2_result.get());
+
+      auto stream_result2 = getCurrentCUDAStream(0);
+      ASSERT_TRUE(stream_result2.ok());
+      EXPECT_EQ(stream_result2.get(), test_stream2_);
+    }
+
+    auto stream_result3 = getCurrentCUDAStream(0);
+    ASSERT_TRUE(stream_result3.ok());
+    EXPECT_EQ(stream_result3.get(), test_stream1_);
+  }
+
+  auto final_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(final_stream_result.ok());
+  EXPECT_EQ(final_stream_result.get(), initial_stream);
+
+  ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess);
+}
+
+TEST_F(CUDAStreamGuardTest, SameStreamNoChange) {
+  Error err = setCurrentCUDAStream(test_stream1_, 0);
+  ASSERT_EQ(err, Error::Ok);
+
+  {
+    auto guard_result = CUDAStreamGuard::create(test_stream1_, 0);
+    ASSERT_TRUE(guard_result.ok());
+    CUDAStreamGuard guard = std::move(guard_result.get());
+
+    auto stream_result = getCurrentCUDAStream(0);
+    ASSERT_TRUE(stream_result.ok());
+    EXPECT_EQ(stream_result.get(), test_stream1_);
+    EXPECT_EQ(guard.stream(), test_stream1_);
+  }
+
+  auto final_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(final_stream_result.ok());
+  EXPECT_EQ(final_stream_result.get(), test_stream1_);
+}
+
+TEST_F(CUDAStreamGuardTest, StreamAccessor) {
+  auto guard_result = CUDAStreamGuard::create(test_stream1_, 0);
+  ASSERT_TRUE(guard_result.ok());
+  CUDAStreamGuard guard = std::move(guard_result.get());
+
+  EXPECT_EQ(guard.stream(), test_stream1_);
+  EXPECT_EQ(guard.device_index(), 0);
+}
+
+TEST_F(CUDAStreamGuardTest, SetStreamMethod) {
+  auto guard_result = CUDAStreamGuard::create(test_stream1_, 0);
+  ASSERT_TRUE(guard_result.ok());
+  CUDAStreamGuard guard = std::move(guard_result.get());
+
+  EXPECT_EQ(guard.stream(), test_stream1_);
+
+  Error err = guard.set_stream(test_stream2_, 0);
+  EXPECT_EQ(err, Error::Ok);
+
+  EXPECT_EQ(guard.stream(), test_stream2_);
+
+  auto current_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(current_stream_result.ok());
+  EXPECT_EQ(current_stream_result.get(), test_stream2_);
+}
+
+TEST_F(CUDAStreamGuardTest, MoveConstructor) {
+  auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0);
+  ASSERT_TRUE(guard1_result.ok());
+  CUDAStreamGuard guard1 = std::move(guard1_result.get());
+
+  EXPECT_EQ(guard1.stream(), test_stream1_);
+  EXPECT_EQ(guard1.device_index(), 0);
+
+  CUDAStreamGuard guard2 = std::move(guard1);
+
+  EXPECT_EQ(guard2.stream(), test_stream1_);
+  EXPECT_EQ(guard2.device_index(), 0);
+
+  auto current_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(current_stream_result.ok());
+  EXPECT_EQ(current_stream_result.get(), test_stream1_);
+}
+
+TEST_F(CUDAStreamGuardTest, MoveConstructorRestoresOnlyOnce) {
+  cudaStream_t initial_stream;
+  ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess);
+
+  Error err = setCurrentCUDAStream(initial_stream, 0);
+  ASSERT_EQ(err, Error::Ok);
+
+  {
+    auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0);
+    ASSERT_TRUE(guard1_result.ok());
+    CUDAStreamGuard guard1 = std::move(guard1_result.get());
+
+    { CUDAStreamGuard guard2 = std::move(guard1); }
+
+    auto stream_result = getCurrentCUDAStream(0);
+    ASSERT_TRUE(stream_result.ok());
+    EXPECT_EQ(stream_result.get(), initial_stream);
+  }
+
+  auto final_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(final_stream_result.ok());
+  EXPECT_EQ(final_stream_result.get(), initial_stream);
+
+  ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess);
+}
+
+TEST_F(CUDAStreamGuardTest, InvalidDeviceIndex) {
+  auto guard_result = CUDAStreamGuard::create(test_stream1_, 999);
+  EXPECT_FALSE(guard_result.ok());
+}
+
+TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) {
+  auto guard_result = CUDAStreamGuard::create(test_stream1_, -2);
+  EXPECT_FALSE(guard_result.ok());
+}
+
+TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) {
+  static_assert(
+      !std::is_copy_constructible_v<CUDAStreamGuard>,
+      "CUDAStreamGuard should not be copy constructible");
+}
+
+TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) {
+  static_assert(
+      !std::is_copy_assignable_v<CUDAStreamGuard>,
+      "CUDAStreamGuard should not be copy assignable");
+}
+
+TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) {
+  static_assert(
+      !std::is_move_assignable_v<CUDAStreamGuard>,
+      "CUDAStreamGuard should not be move assignable");
+}
+
+TEST_F(CUDAStreamGuardTest, NullStreamPointer) {
+  auto guard_result = CUDAStreamGuard::create(nullptr, 0);
+  ASSERT_TRUE(guard_result.ok());
+  CUDAStreamGuard guard = std::move(guard_result.get());
+
+  EXPECT_EQ(guard.stream(), nullptr);
+
+  auto current_stream_result = getCurrentCUDAStream(0);
+  ASSERT_TRUE(current_stream_result.ok());
+}
diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/utils.h
similarity index 92%
rename from backends/cuda/runtime/shims/utils.h
rename to backends/cuda/runtime/utils.h
index 99d2bc102f5..2d805724090 100644
--- a/backends/cuda/runtime/shims/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -34,12 +34,11 @@
 #define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \
   ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError())
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 // Enum for supported data types in et-cuda backend
 enum class SupportedDTypes : int32_t {
+  INT64 = 4, // PyTorch's int64 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
 };
@@ -100,6 +99,7 @@ using AOTITorchError = Error;
 // Helper function to check if a dtype is supported in ET CUDA backend
 inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
   switch (dtype) {
+    case static_cast<int32_t>(SupportedDTypes::INT64):
     case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
       return true;
@@ -113,8 +113,9 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (float32), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int64), %d (float32), %d (bfloat16)",
       dtype,
+      static_cast<int32_t>(SupportedDTypes::INT64),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
 
@@ -122,6 +123,4 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
 }
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
index 99f8d33a766..d794a4f042c 100644
--- a/backends/cuda/tests/test_cuda_export.py
+++ b/backends/cuda/tests/test_cuda_export.py
@@ -8,6 +8,7 @@
 from typing import Tuple
 
 import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from torch.export import export
@@ -30,7 +31,9 @@ def _export_to_cuda_with_lower(
         exported_program = export(module, inputs, strict=True)
 
         # Create partitioner and compile specs
-        partitioner = CudaPartitioner([])
+        partitioner = CudaPartitioner(
+            [CudaBackend.generate_method_name_compile_spec("forward")]
+        )
 
         # Use to_edge_transform_and_lower for complete pipeline
         edge_program_manager = to_edge_transform_and_lower(
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
new file mode 100644
index 00000000000..feed458e1f5
--- /dev/null
+++ b/backends/cuda/tests/voxtral_runner.cpp
@@ -0,0 +1,264 @@
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/portable_type/tensor.h>
+
+namespace {
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::extension::make_tensor_ptr;
+using executorch::extension::TensorPtr;
+using executorch::extension::module::Module;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using Clock = std::chrono::steady_clock;
+using DurationMs = std::chrono::duration<double, std::milli>;
+
+std::vector<executorch::aten::SizesType> to_sizes(
+    std::initializer_list<int64_t> dims) {
+  return std::vector<executorch::aten::SizesType>(dims.begin(), dims.end());
+}
+
+std::string format_shape(const Tensor& tensor) {
+  std::ostringstream oss;
+  oss << "[";
+  const auto& sizes = tensor.sizes();
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    if (i > 0) {
+      oss << ", ";
+    }
+    oss << sizes[i];
+  }
+  oss << "]";
+  return oss.str();
+}
+
+void print_tensor_summary(const std::string& label, const Tensor& tensor) {
+  std::cout << "    " << label
+            << ": dtype=" << executorch::runtime::toString(tensor.scalar_type())
+            << ", shape=" << format_shape(tensor)
+            << ", numel=" << tensor.numel() << std::endl;
+}
+
+TensorPtr create_audio_input() {
+  const auto sizes = to_sizes({3, 128, 3000});
+  const size_t numel = 3ull * 128ull * 3000ull;
+  std::vector<float> data(numel, 0.5f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+TensorPtr create_token_ids_input() {
+  const auto sizes = to_sizes({1, 1138});
+  std::vector<int64_t> data(static_cast<size_t>(1) * 1138, 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_positions_input() {
+  const auto sizes = to_sizes({1138});
+  std::vector<int64_t> data(static_cast<size_t>(1138), 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_fallback_text_embedding() {
+  const auto sizes = to_sizes({1, 1138, 3072});
+  const size_t numel = 1ull * 1138ull * 3072ull;
+  std::vector<float> data(numel, 0.0f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+struct MethodTiming {
+  double load_ms{0.0};
+  double run_ms{0.0};
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0]
+              << " <path/to/model.pte> <path/to/aoti_cuda_blob.ptd>"
+              << std::endl;
+    return 1;
+  }
+
+  const std::string program_path = argv[1];
+  const std::string data_map_path = argv[2];
+
+  try {
+    Module module(program_path, data_map_path);
+
+    const auto program_load_start = Clock::now();
+    const Error program_load_error = module.load();
+    const auto program_load_end = Clock::now();
+    if (program_load_error != Error::Ok) {
+      std::cerr << "Failed to load ExecuTorch program: error code "
+                << static_cast<int>(program_load_error) << std::endl;
+      return 1;
+    }
+    const DurationMs program_load_latency =
+        program_load_end - program_load_start;
+
+    MethodTiming audio_timing;
+    MethodTiming token_timing;
+    MethodTiming text_timing;
+
+    auto measure_method_load =
+        [&](const std::string& name) -> std::pair<Error, double> {
+      const auto start = Clock::now();
+      const Error err = module.load_method(name);
+      const auto end = Clock::now();
+      return {err, DurationMs(end - start).count()};
+    };
+
+    // audio_encoder
+    {
+      const auto [err, load_ms] = measure_method_load("audio_encoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method audio_encoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      audio_timing.load_ms = load_ms;
+
+      const TensorPtr audio_input = create_audio_input();
+      std::vector<EValue> inputs;
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(audio_input);
+      inputs.emplace_back(*audio_input);
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("audio_encoder", inputs);
+      const auto run_end = Clock::now();
+      audio_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "audio_encoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("audio_encoder output", outputs[0].toTensor());
+      }
+    }
+
+    EValue token_output;
+    bool token_executed = false;
+
+    // token_embedding
+    {
+      const auto [err, load_ms] = measure_method_load("token_embedding");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method token_embedding: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      token_timing.load_ms = load_ms;
+
+      const TensorPtr token_ids = create_token_ids_input();
+      std::vector<EValue> inputs;
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(token_ids);
+      inputs.emplace_back(*token_ids);
+
+      const auto run_start = Clock::now();
+      auto token_output_result = module.execute("token_embedding", inputs);
+      const auto run_end = Clock::now();
+      token_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (token_output_result.error() != Error::Ok) {
+        std::cerr << "token_embedding execution failed: error code "
+                  << static_cast<int>(token_output_result.error()) << std::endl;
+        return 1;
+      }
+
+      token_executed = true;
+      const auto& outputs = token_output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("token_embedding output", outputs[0].toTensor());
+        token_output = outputs[0];
+      }
+    }
+
+    // text_decoder
+    {
+      const auto [err, load_ms] = measure_method_load("text_decoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method text_decoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      text_timing.load_ms = load_ms;
+
+      std::vector<EValue> inputs;
+      std::vector<TensorPtr> owned_inputs;
+      if (token_executed) {
+        if (token_output.isTensor()) {
+          inputs.emplace_back(token_output);
+        }
+      }
+
+      if (inputs.empty()) {
+        auto fallback_embedding = create_fallback_text_embedding();
+        owned_inputs.emplace_back(fallback_embedding);
+        inputs.emplace_back(*fallback_embedding);
+      }
+
+      auto positions = create_positions_input();
+      owned_inputs.emplace_back(positions);
+      inputs.emplace_back(*positions);
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("text_decoder", inputs);
+      const auto run_end = Clock::now();
+      text_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "text_decoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("text_decoder output", outputs[0].toTensor());
+      }
+    }
+
+    std::cout << std::fixed << std::setprecision(3);
+    std::cout << "Program load latency (ms): " << program_load_latency.count()
+              << std::endl;
+
+    std::cout << "Method load latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.load_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.load_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.load_ms << std::endl;
+
+    std::cout << "Run latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.run_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.run_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.run_ms << std::endl;
+
+    return 0;
+  } catch (const std::exception& ex) {
+    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt
index ed9b37e1998..10c28be0053 100644
--- a/backends/mediatek/CMakeLists.txt
+++ b/backends/mediatek/CMakeLists.txt
@@ -46,5 +46,5 @@ executorch_target_link_options_shared_lib(neuron_backend)
 install(
   TARGETS neuron_backend
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md
index e8a535b3fde..6ff751f8408 100644
--- a/backends/mediatek/README.md
+++ b/backends/mediatek/README.md
@@ -28,7 +28,7 @@ To get started with MediaTek's ExecuTorch libraries, download the [NeuroPilot Ex
 
 - **`mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`**: This library preprocesses the model into a MediaTek representation.
 
-- **`mtk_neuron-8.2.19-py3-none-linux_x86_64.whl`**: This library converts the model to binaries.
+- **`mtk_neuron-8.2.23-py3-none-linux_x86_64`**: This library converts the model to binaries.
 
 Additionally, make sure to copy `NeuronAdapter.h` to the following directory: `backends/mediatek/runtime/include/api/`.
 
@@ -45,7 +45,7 @@ Follow the steps below to setup your build environment:
    ```
 - Install the two .whl downloaded from NeuroPilot Portal
    ```bash
-   pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+   pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
    pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    ```
 
diff --git a/backends/mediatek/scripts/mtk_build.sh b/backends/mediatek/scripts/mtk_build.sh
index 599f754d7bc..d42e5f7e10a 100755
--- a/backends/mediatek/scripts/mtk_build.sh
+++ b/backends/mediatek/scripts/mtk_build.sh
@@ -30,6 +30,7 @@ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
       -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
       -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
       -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+      -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
       -DEXECUTORCH_BUILD_NEURON=ON \
       -B"${build_dir}"
 
diff --git a/backends/nxp/CMakeLists.txt b/backends/nxp/CMakeLists.txt
index 43fcaa24d19..bfc4c046be6 100644
--- a/backends/nxp/CMakeLists.txt
+++ b/backends/nxp/CMakeLists.txt
@@ -17,5 +17,5 @@ target_include_directories(
 install(
   TARGETS executorch_delegate_neutron
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
diff --git a/backends/nxp/README.md b/backends/nxp/README.md
index 10eb1290a8b..de41cdd282e 100644
--- a/backends/nxp/README.md
+++ b/backends/nxp/README.md
@@ -15,7 +15,8 @@ networks, as well as the ability to adapt and scale to new model architectures,
 to AI workloads. ML application development with the eIQ Neutron NPU is fully supported by the 
 [eIQ machine learning software development environment](https://www.nxp.com/design/design-center/software/eiq-ml-development-environment/eiq-toolkit-for-end-to-end-model-development-and-deployment:EIQ-TOOLKIT).
 The eIQ AI SW Stack provides a streamlined development experience for developers and end-users of NXP products.
-eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices.
+eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers 
+to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices.
 
 
 ## Supported NXP platforms
@@ -35,37 +36,28 @@ improvements. NXP and the ExecuTorch community is actively developing this codeb
 
 ## Neutron Backend implementation and SW architecture
 Neutron Backend uses the eIQ Neutron Converter as ML compiler to compile the delegated subgraph to Neutron microcode. 
-The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class  therefore the Neutron Backend uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler. 
-
-The Neutron Backend in its early prototype phase, is based on existing NXP products, such as 
-onnx2tflite, known from the NXP's eIQ Toolkit. 
-The **onnx2tflite** is a converter from the ONNX format to LiteRT (formerly known as TFLite).
-It consists of 3 stages: 
-* ONNX Model Parsing
-* Tensor Format Inference, to identify tensors using channel-first layer
-* ONNX to LiteRT Conversion 
-* Optimization Passes, which operate on top of the LiteRT format
-* LiteRT Serialization 
-
-Due to the similarities between ONNX to LiteRT and Edge to LiteRT conversion, the Neutron Backend's 
-currently leverages the Tensor format Inference and LiteRT Optimizer. 
-This shall be considered as temporary solution, intended to be replaced with: 
-* Dim Order (https://github.com/pytorch/executorch/issues/4873)
-* Corresponding ExecuTorch/ATen passes
-
-before reaching higher maturity status by the end of 2025. 
+The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class  therefore the Neutron Backend
+uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler.
 
 ## Layout
-The current code base is as follows:
 * `backend/ir/` - TFLite/LiteRT based IR to represent the Edge Subgraph, taken from onnx2tflite code base and extended to
   support Edge Dialect to LiteRT conversion.
     * `backend/ir/converter` - Neutron Backends conversion from Edge (ATen) Dialect to LiteRT, TFLite. The subfolder
       `node_conveters` is structured as single module for each Edge operator.
-    * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema
+    * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema.
     * `backend/ir/tflite_generator` and `backend/ir/tflite_optimizer` handle the serialization
        of the in-memory built subgraph for delegation into LiteRT/TFLite flatbuffers 
        representation. Code taken from the onnx2tflite tool.
-*  `quantizer` - Neutron Backends quantizer implementation. 
+*  `edge_passes` - Various passes operating on Edge dialect level. 
+*  `quantizer` - Neutron Backend quantizer implementation. 
+*  `runtime` - Neutron Backend runtime implementation. For running compiled on device.
+*  `tests/` - Unit tests for Neutron backend.
+    * `tests/converter/node_converter` - Operator level unit tests.
+
+* `examples/nxp/` - Example models and scripts for running them.
+
+## Examples
+Please see this [README.md](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md).
 
 ## Help & Improvements
 If you have problems or questions or have suggestions for ways to make
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
index 192798c151e..fcfb9787715 100644
--- a/backends/nxp/backend/edge_program_converter.py
+++ b/backends/nxp/backend/edge_program_converter.py
@@ -18,6 +18,7 @@
 from torch.fx import Node
 from torch.nn.parameter import Parameter
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import *  # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.backend.node_format_inference import (
     NodeFormat,
     NodeFormatInference,
@@ -33,6 +34,7 @@
     exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter,  # noqa F405
     exir_ops.edge.aten.cat.default: CatConverter,  # noqa F405
     exir_ops.edge.aten.clone.default: CloneConverter,  # noqa F405
+    exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
@@ -42,6 +44,7 @@
     exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter,  # noqa F405
     exir_ops.edge.aten.relu.default: ReLUConverter,  # noqa F405
     exir_ops.edge.aten._softmax.default: SoftmaxConverter,  # noqa F405
+    exir_ops.edge.aten.sub.Tensor: SubTensorConverter,  # noqa F405
     exir_ops.edge.aten.tanh.default: TanhConverter,  # noqa F405
     exir_ops.edge.aten.view_copy.default: ViewCopyConverter,  # noqa F405
     exir_ops.edge.aten.sigmoid.default: SigmoidConverter,  # noqa F405
@@ -54,12 +57,14 @@ class EdgeProgramToIRConverter:
     """
 
     _default_conversion_config = ConversionConfig()
+    _default_target_spec = NeutronTargetSpec("imxrt700", "SDK_25_09")
     _default_delegation_options = CustomDelegationOptions()
 
     def convert_program(
         self,
         edge_program: ExportedProgram,
-        conversion_config=_default_conversion_config,
+        conversion_config: ConversionConfig = _default_conversion_config,
+        neutron_target_spec: NeutronTargetSpec = _default_target_spec,
         custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
     ) -> (bytes, dict):
         """
@@ -67,6 +72,7 @@ def convert_program(
 
         :param edge_program: Converter ExportedProgram.
         :param conversion_config: ConversionConfig instance.
+        :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param custom_delegation_options: Custom user options which affect node delegation.
         :return: TFLite flatbuffers as bytes.
         """
@@ -76,6 +82,7 @@ def convert_program(
         cc = self.build_conversion_context(
             parameters_mapping,
             node_formats,
+            neutron_target_spec,
             conversion_config,
             custom_delegation_options,
         )
@@ -173,11 +180,12 @@ def map_inputs_to_parameters(edge_program: ExportedProgram) -> dict[str, Paramet
     def build_conversion_context(
         parameters_mapping: dict,
         node_formats: dict[Node, NodeFormat],
+        neutron_target_spec: NeutronTargetSpec,
         conversion_config: ConversionConfig = _default_conversion_config,
         custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
     ) -> ConversionContext:
         tflite_builder = AtenModelBuilderDirector(
-            3, "TFLite from EdgeProgram", conversion_config
+            3, "TFLite from EdgeProgram", neutron_target_spec, conversion_config
         )
 
         # Add "sentinel" buffer (defined in schema.fbs)
diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py
index 496fa752853..643a6231d15 100755
--- a/backends/nxp/backend/ir/converter/builder/model_builder.py
+++ b/backends/nxp/backend/ir/converter/builder/model_builder.py
@@ -48,6 +48,7 @@
     FlexTranspose,
 )
 from executorch.backends.nxp.backend.ir.tflite_optimizer import optimizer
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 
 
 class ModelBuilder:
@@ -74,17 +75,21 @@ class ModelBuilder:
 
     _zeros_tensor_map: Dict  # Mapping 'string' shapes to 'tflT.Tensor' objects
 
-    _default_conversion_config = ConversionConfig()
+    neutron_target_spec: NeutronTargetSpec
 
     conversion_config: ConversionConfig
 
+    _default_conversion_config = ConversionConfig()
+
     def __init__(
         self,
         model_version: int,
         model_description: str,
+        neutron_target_spec: NeutronTargetSpec,
         conversion_config: ConversionConfig = _default_conversion_config,
     ) -> None:
         self._tfl_model = tflite_model.Model(model_version, model_description)
+        self.neutron_target_spec = neutron_target_spec
         self.conversion_config = conversion_config
 
         self.op_code_type_index_map = {}
@@ -471,31 +476,7 @@ def finish(self) -> tflite_model.Model:
 
         return self._tfl_model
 
-    def _assign_tensor_and_buffer_indices(  # noqa C901
-        self, allow_inputs_stripping: bool
-    ):
-        """Correctly initialize all references via indices in all tensors and buffers."""
-
-        # Assign each buffer its index
-        for i, buffer in enumerate(self.get_buffers().vector):
-            buffer.tmp_index = i
-
-        # Assign each tensor its index and its buffer index
-        for i, tensor in enumerate(self.get_tensors().vector):
-            if tensor.tmp_null_tensor:
-                # Using -1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that
-                #  this tensor should not be used.
-                # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
-                tensor.tmp_index = -1
-            else:
-                tensor.tmp_index = i
-
-            tensor.buffer = tensor.tmp_buffer.tmp_index
-
-        # TODO Remove inputs and outputs that are not in the tensors collection
-
-        # Assign 'Outputs' and 'Inputs' their tensor indices
-        outputs = self.get_sub_graph().outputs
+    def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
         for tensor in outputs.tmp_outputs:
             try:
                 outputs.append(tensor.tmp_index)
@@ -505,7 +486,6 @@ def _assign_tensor_and_buffer_indices(  # noqa C901
                     f"The tensor '{tensor.name}' is among the model outputs, but does NOT appear in the graph!",
                 )
 
-        inputs = self.get_sub_graph().inputs
         for tensor in inputs.tmp_inputs:
             try:
                 inputs.append(tensor.tmp_index)
@@ -520,14 +500,46 @@ def _assign_tensor_and_buffer_indices(  # noqa C901
                         f"The tensor '{tensor.name}' is among the model inputs, but does NOT appear in the graph!",
                     )
 
-        # Assign each operator its inputs and outputs indices
-        for operator in self.get_sub_graph().operators.vector:
+    def _assign_operators_io_tensor_indices(self, operators):
+        for operator in operators.vector:
             for inputTensor in operator.tmp_inputs:
                 operator.inputs.append(inputTensor.tmp_index)
 
             for outputTensor in operator.tmp_outputs:
                 operator.outputs.append(outputTensor.tmp_index)
 
+    def _assign_tensor_and_buffer_indices(self, allow_inputs_stripping: bool):
+        """Correctly initialize all references via indices in all tensors and buffers."""
+
+        # Assign each buffer its index
+        for i, buffer in enumerate(self.get_buffers().vector):
+            buffer.tmp_index = i
+
+        # Assign each tensor its index and its buffer index
+        for i, tensor in enumerate(self.get_tensors().vector):
+            if tensor.tmp_null_tensor:
+                # Using -1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that
+                #  this tensor should not be used.
+                # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98
+                tensor.tmp_index = -1
+            else:
+                tensor.tmp_index = i
+
+            tensor.buffer = tensor.tmp_buffer.tmp_index
+
+        # TODO Remove inputs and outputs that are not in the tensors collection
+
+        subgraph = self.get_sub_graph()
+
+        # Assign 'Outputs' and 'Inputs' their tensor indices
+        self._assign_io_tensor_indices(
+            inputs=subgraph.inputs,
+            outputs=subgraph.outputs,
+            allow_inputs_stripping=allow_inputs_stripping,
+        )
+        # Assign each operator its inputs and outputs indices
+        self._assign_operators_io_tensor_indices(operators=subgraph.operators)
+
     def _build_operator_code(
         self, op_type: BuiltinOperator, version, custom_code: str = None
     ):
diff --git a/backends/nxp/backend/ir/converter/node_converter.py b/backends/nxp/backend/ir/converter/node_converter.py
index c44a6e19955..36266486aac 100755
--- a/backends/nxp/backend/ir/converter/node_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converter.py
@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 from abc import ABC, abstractmethod
-from enum import Enum
 
 import torch
 
@@ -16,6 +15,7 @@
     AtenModelBuilderDirector,
 )
 from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.fx import Node
 from torch.fx.passes.infra.partitioner import Partition
@@ -42,17 +42,6 @@ def is_not_qdq_node(node: torch.fx.Node) -> bool:
     return not (_is_quant_node(node) or _is_dequant_node(node))
 
 
-class Target(Enum):
-    IGNORE = "ignore"  # No target platform. Any target specific restrictions will be ignored.
-
-    RT700 = "imxrt700"
-    IMX95 = "imx95"
-
-    @classmethod
-    def values(cls) -> list[str]:
-        return [elt.value for elt in cls]
-
-
 class NodeConverter(ABC):
     """
     Classes which implement conversion of torch.Node to TFLite should inherit from this class and overwrite the
@@ -94,7 +83,7 @@ def _is_supported_in_IR(
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
@@ -103,31 +92,31 @@ def _is_supported_on_target(
             can be used by operators with no target specific requirements.
 
         :param node: The node (edge operator) to check.
-        :param target: Value of the `Target` enum representing the target platform to check for.
+        :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param parameters_mapping: Dictionary mapping tensor names to their static data (if they have it).
         :param custom_delegation_options: Custom options which affect delegation.
         """
-        return target == Target.RT700
+        return True
 
     @classmethod
     def is_supported(
         cls,
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
         """Check if the given `node` is supported in the IR and on the given `target` platform.
 
         :param node: torch.Node to check.
-        :param target: Value of the `Target` enum representing the target platform to check for.
+        :param neutron_target_spec: Object for querying the target platform to retrieve its properties.
         :param parameters_mapping: Dict mapping tensor names to their data.
         :param custom_delegation_options: Custom user options which affect node delegation.
         """
         return cls._is_supported_in_IR(
             node, parameters_mapping, custom_delegation_options
         ) and cls._is_supported_on_target(
-            node, target, parameters_mapping, custom_delegation_options
+            node, neutron_target_spec, parameters_mapping, custom_delegation_options
         )
 
     @classmethod
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
index 472a3495e19..3cf70f46b8d 100755
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py
@@ -56,6 +56,9 @@
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.softmax_converter import (
     SoftmaxConverter,
 )
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.sub_tensor_converter import (
+    SubTensorConverter,
+)
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.tanh_converter import (
     TanhConverter,
 )
@@ -80,6 +83,7 @@
     "MaxPool2dConverter",
     "AvgPool2dConverter",
     "AddTensorConverter",
+    "SubTensorConverter",
     "CloneConverter",
     "AbsConverter",
     "AdaptiveAvgPool2dConverter",
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
index c74baa61f67..cd5aa2ead81 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py
@@ -9,11 +9,11 @@
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
     NodeConverter,
-    Target,
 )
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
     add_options,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -22,20 +22,15 @@ class AddTensorConverter(NodeConverter):
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        match target:
-            case Target.RT700:
-                if node_uses_shape_broadcasting(node):
-                    # Shape broadcasting may require the addition of `Transpose` ops during conversion.
-                    return False
-
-                return True
+        if node_uses_shape_broadcasting(node):
+            # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+            return False
 
-            case _:
-                return False
+        return True
 
     @staticmethod
     def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
index 4f7f00fe5ba..22ca258cd4f 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py
@@ -13,11 +13,11 @@
     _is_dequant_node,
     _is_quant_node,
     NodeConverter,
-    Target,
 )
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.concatenation_options import (
     Concatenation,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -72,51 +72,52 @@ def _all_io_shares_quantization_parameters(node: Node) -> bool:
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
         if custom_delegation_options.force_delegate_cat:
             return True
 
-        match target:
-            case Target.RT700:
-                dim = CatConverter._get_normalized_dim(node)
-
-                # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
-                if dim == 0:
-                    return False
-
-                # Neutron requires the channels to be a multiple of `8`. The channels could either be the second or the
-                #  last dimension, depending on the formats of the node. The format, however, cannot be determined
-                #  during conversion, as it depends on what other nodes are delegated.
-                input_channels = [
-                    # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
-                    #  will still be the channels in the IR.
-                    _get_shape(input_)[1]
-                    for input_ in node.all_input_nodes
-                ] + [
-                    # If the inputs/outputs are channels first, the last dimension will be the channels.
-                    _get_shape(input_)[-1]
-                    for input_ in node.all_input_nodes
-                ]
-                if any((input_channel % 8) != 0 for input_channel in input_channels):
-                    # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
-                    return False
-
-                output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
-                # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
-                if any((out_c % 8) != 0 for out_c in output_channels):
-                    return False
-
-                if len(node.all_input_nodes) < 2:  # Not supported on Neutron
-                    # TODO Try to skip the operator if this case is realistic.
-                    return False
-
-                return True
-
-            case _:
-                return False
+        dim = CatConverter._get_normalized_dim(node)
+
+        # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491
+        if dim == 0:
+            return False
+
+        # Neutron requires the channels to be a multiple of numMacs. The channels could either be the second or the
+        #  last dimension, depending on the formats of the node. The format, however, cannot be determined
+        #  during conversion, as it depends on what other nodes are delegated.
+        input_channels = [
+            # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it
+            #  will still be the channels in the IR.
+            _get_shape(input_)[1]
+            for input_ in node.all_input_nodes
+        ] + [
+            # If the inputs/outputs are channels first, the last dimension will be the channels.
+            _get_shape(input_)[-1]
+            for input_ in node.all_input_nodes
+        ]
+        if any(
+            (input_channel % neutron_target_spec.get_num_macs()) != 0
+            for input_channel in input_channels
+        ):
+            # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492
+            return False
+
+        output_channels = [_get_shape(node)[1], _get_shape(node)[-1]]
+        # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493
+        if any(
+            (out_c % neutron_target_spec.get_num_macs()) != 0
+            for out_c in output_channels
+        ):
+            return False
+
+        if len(node.all_input_nodes) < 2:  # Not supported on Neutron
+            # TODO Try to skip the operator if this case is realistic.
+            return False
+
+        return True
 
     @staticmethod
     def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py
index 1d370ab8c48..17b2cee9874 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py
@@ -20,6 +20,11 @@ def _has_supported_memory_format(node: Node) -> bool:
 
 
 class CloneConverter(NodeConverter):
+    """
+    This converter is responsible for converting both edge operators:
+    - aten.clone.default
+    - dim_order_ops._clone_dim_order.default
+    """
 
     @staticmethod
     def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
index f58df1a88d9..499541aa58c 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py
@@ -17,7 +17,6 @@
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
     NodeConverter,
-    Target,
 )
 from executorch.backends.nxp.backend.ir.converter.quantization_utils import (
     quantize_int8,
@@ -27,6 +26,7 @@
     pad_options,
     pad_v2_options,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -35,22 +35,16 @@ class ConstantPadNDConverter(NodeConverter):
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        match target:
-            case Target.RT700:
-                # TODO: Consider different tensor formats (dim-order)
-                paddings = node.args[1]
-                if len(paddings) > 4 and paddings[4:6] != [0, 0]:
-                    # Attempt to Pad channels dimension, which is not supported on Neutron.
-                    return False
-
-                return True
-
-            case _:
-                return False
+        paddings = node.args[1]
+        if len(paddings) > 4 and paddings[4:6] != [0, 0]:
+            # Attempt to Pad channels dimension, which is not supported on Neutron.
+            return False
+
+        return True
 
     @staticmethod
     def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
index 8955b4c8fd4..f32b5a65cac 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py
@@ -25,7 +25,6 @@
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
     NodeConverter,
-    Target,
 )
 from executorch.backends.nxp.backend.ir.converter.node_converters.shared import (
     conv_utils,
@@ -45,6 +44,7 @@
     depthwise_conv_2d_options,
     reshape_options,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -53,45 +53,38 @@ class ConvolutionConverter(NodeConverter):
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        match target:
-            case Target.RT700:
-                activations = node.args[0]
-                weights = node.args[1]
-                groups = node.args[8]
-
-                if activations.meta["val"].shape[0] != 1:
-                    # Only batch size 1 is supported on neutron.
-                    return False
-
-                if groups == 1:  # Regular convolution.
-                    pass
-                elif conv_utils.group_conv_convertible_as_depthwise(
-                    node, groups
-                ):  # Depthwise convolution.
-                    # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
-                    #  weights. In case the weights are dynamic, a Transpose operator would have to be added, which
-                    #  is not supported on Neutron.
-                    if not node_is_effectively_static_tensor(
-                        weights, parameters_mapping
-                    ):
-                        return False
-                elif conv_utils.group_conv_convertible_into_multiple_convolutions(
-                    node, groups
-                ):  # Separable conv. This should never be reached, as the node should have been decomposed into
-                    #  multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
-                    logging.warning("Group convolution was not decomposed.")
-                    return False
-                else:  # Unexpected case (should never happen).
-                    return False
-
-                return True
-
-            case _:
+        activations = node.args[0]
+        weights = node.args[1]
+        groups = node.args[8]
+
+        if activations.meta["val"].shape[0] != 1:
+            # Only batch size 1 is supported on neutron.
+            return False
+
+        if groups == 1:  # Regular convolution.
+            pass
+        elif conv_utils.group_conv_convertible_as_depthwise(
+            node, groups
+        ):  # Depthwise convolution.
+            # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted
+            #  weights. In case the weights are dynamic, a Transpose operator would have to be added, which
+            #  is not supported on Neutron.
+            if not node_is_effectively_static_tensor(weights, parameters_mapping):
                 return False
+        elif conv_utils.group_conv_convertible_into_multiple_convolutions(
+            node, groups
+        ):  # Separable conv. This should never be reached, as the node should have been decomposed into
+            #  multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass.
+            logging.warning("Group convolution was not decomposed.")
+            return False
+        else:  # Unexpected case (should never happen).
+            return False
+
+        return True
 
     @staticmethod
     def _is_supported_in_IR(
@@ -238,7 +231,7 @@ def _convert_1d_conv(
     def _convert_unpadded_2D(
         self, t_op: tflite_model.Operator, conv_params: ConvParameters
     ) -> conv_utils.ConvConversionResult:
-        """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converter by the
+        """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converted by the
         caller.
         """
         common.assign_2d_strides(t_op.builtin_options, conv_params.stride)
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
index f03c403876f..c1dd7b600be 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py
@@ -12,7 +12,6 @@
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
     NodeConverter,
-    Target,
 )
 from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import (
     convert_axes_from_attribute,
@@ -20,6 +19,7 @@
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
     mean_options,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -28,34 +28,20 @@ class MeanDimConverter(NodeConverter):
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        match target:
-            case Target.RT700:
-                # TODO: Consider different tensor formats (dim-order)
-                dim = node.args[1]
-                keepdim = node.args[2] if len(node.args) >= 3 else False
-                rank = len(node.args[0].meta["val"].shape)
-                dim = [MeanDimConverter._to_neg_dim(d, rank) for d in dim]
-
-                # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
-                if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
-                    return False
-
-                return True
-
-            case _:
-                return False
+        dim = node.args[1]
+        keepdim = node.args[2] if len(node.args) >= 3 else False
+        rank = len(node.args[0].meta["val"].shape)
+        dim = [d - rank if d > 0 else d for d in dim]
 
-    @staticmethod
-    def _to_pos_dim(d, rank):
-        return d + rank if d < 0 else d
+        # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron.
+        if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim:
+            return False
 
-    @staticmethod
-    def _to_neg_dim(d, rank):
-        return d - rank if d > 0 else d
+        return True
 
     @staticmethod
     def _is_supported_in_IR(
@@ -75,6 +61,10 @@ def _is_supported_in_IR(
 
         return True
 
+    @staticmethod
+    def _to_pos_dim(d: int, rank: int):
+        return d + rank if d < 0 else d
+
     @staticmethod
     def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]:
         # convert negative index to positive
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
index aa74c78ca24..5e4404d8476 100644
--- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py
@@ -7,13 +7,11 @@
     CustomDelegationOptions,
 )
 from executorch.backends.nxp.backend.edge_helper import input_rank
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
-    NodeConverter,
-    Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
 from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
     softmax_options,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.fx import Node
 from torch.nn import Parameter
 
@@ -22,18 +20,11 @@ class SoftmaxConverter(NodeConverter):
     @staticmethod
     def _is_supported_on_target(
         node: Node,
-        target: Target,
+        neutron_target_spec: NeutronTargetSpec,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        match target:
-            case Target.RT700:
-                # The eIQ Neutron NPU runtime software has a known issue with the SoftMax operation.
-                #  As long as the issue is present, return False for the i.MX RT700 target also.
-                return False
-
-            case _:
-                return False
+        return False
 
     @staticmethod
     def _is_supported_in_IR(
diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
new file mode 100644
index 00000000000..e9522c87114
--- /dev/null
+++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py
@@ -0,0 +1,59 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.nxp.backend.ir.converter.conversion.common import (
+    node_uses_shape_broadcasting,
+)
+from executorch.backends.nxp.backend.ir.converter.node_converter import (
+    CustomDelegationOptions,
+    NodeConverter,
+)
+from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import (
+    sub_options,
+)
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
+from torch.fx import Node
+from torch.nn import Parameter
+
+
+class SubTensorConverter(NodeConverter):
+    @staticmethod
+    def _is_supported_on_target(
+        node: Node,
+        neutron_target_spec: NeutronTargetSpec,
+        parameters_mapping: dict[str, Parameter],
+        custom_delegation_options: CustomDelegationOptions,
+    ) -> bool:
+        if node_uses_shape_broadcasting(node):
+            # Shape broadcasting may require the addition of `Transpose` ops during conversion.
+            return False
+
+        return True
+
+    @staticmethod
+    def _is_supported_in_IR(
+        node: Node,
+        parameters_mapping: dict[str, Parameter],
+        custom_delegation_options: CustomDelegationOptions,
+    ) -> bool:
+        if len(node.args) != 2:
+            return False
+
+        # The `alpha` attribute can be represented by adding an extra `Mul` operator.
+        #  However, this is not implemented as `alpha` is rarely used.
+        if hasattr(node.kwargs, "alpha"):
+            return False
+
+        return True
+
+    # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1)
+    def convert(self, node: Node):
+        """Convert 'sub_tensor' operator to NeutronIR 'Sub'."""
+        self.assert_convertible(node)
+
+        t_op = self._create_tflite_op_with_io_tensors(node)
+
+        t_op.builtin_options = sub_options.Sub()
+        self.builder.append_operators([t_op])
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
index dc9ad9999b4..0be46efcaa8 100755
--- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
+++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
index 2bc4380f89b..a6884a9ee24 100644
--- a/backends/nxp/backend/neutron_converter_manager.py
+++ b/backends/nxp/backend/neutron_converter_manager.py
@@ -7,8 +7,6 @@
 import multiprocessing
 import pkgutil
 
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
-
 
 def convert_unsafe(neutron_converter, tflite_model, cctx, queue):
     """
@@ -27,16 +25,7 @@ class NeutronConverterManager:
     contains NeutronGraph nodes.
     """
 
-    _supported_target_names = [Target.RT700.value]
-
-    def convert(
-        self, tflite_model: bytes, target: str, neutron_converter_flavor: str
-    ) -> bytes:
-        # Neutron converter crashes if we provide invalid target -> verify.
-        if target not in self._supported_target_names:
-            raise RuntimeError(
-                f"Target '{target}' is not supported by NeutronConverterManager."
-            )
+    def __init__(self, neutron_converter_flavor: str = "SDK_25_09"):
 
         neutron_converter_modules = [
             module.name
@@ -57,13 +46,34 @@ def convert(
                     f"not found. Install 'neutron_converter_[flavor]' Python package."
                 )
 
-        neutron_converter = importlib.import_module(
+        self.neutron_converter = importlib.import_module(
             f"{requested_module_name}.neutron_converter"
         )
+        self.neutron_library_utils = importlib.import_module(
+            f"{requested_module_name}.neutron_library_utils"
+        )
+
+    def get_converter(self):
+        return self.neutron_converter
+
+    def get_library_utils(self):
+        return self.neutron_library_utils
+
+    def verify_target(self, target: str):
+        if not self.neutron_library_utils.isNeutronTarget(target):
+            valid_targets = [
+                target.name for target in self.neutron_library_utils.getNeutronTargets()
+            ]
+            raise ValueError(
+                f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`."
+            )
+
+    def convert(self, tflite_model: bytes, target: str) -> bytes:
+        # Neutron converter crashes if we provide invalid target -> verify.
+        self.verify_target(target)
 
-        cctx = neutron_converter.CompilationContext()
-        cctx.targetOpts = neutron_converter.getNeutronTarget(target)
-        # New switch since Neutron Converter SDK_25.06
+        cctx = self.neutron_converter.CompilationContext()
+        cctx.targetOpts = self.neutron_converter.getNeutronTarget(target)
         cctx.compilationOpts.minNumOpsPerGraph = 1
 
         logger = multiprocessing.log_to_stderr()
@@ -71,7 +81,8 @@ def convert(
         queue = multiprocessing.Manager().Queue()
 
         process = multiprocessing.Process(
-            target=convert_unsafe, args=(neutron_converter, tflite_model, cctx, queue)
+            target=convert_unsafe,
+            args=(self.neutron_converter, tflite_model, cctx, queue),
         )
         process.start()
         process.join()  # waits until the subprocess is complete
diff --git a/backends/nxp/backend/neutron_target_spec.py b/backends/nxp/backend/neutron_target_spec.py
new file mode 100644
index 00000000000..44399982e29
--- /dev/null
+++ b/backends/nxp/backend/neutron_target_spec.py
@@ -0,0 +1,64 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Target Spec for the NXP Neutron NPU
+
+from enum import Enum
+
+from executorch.backends.nxp.backend.neutron_converter_manager import (
+    NeutronConverterManager,
+)
+
+
+class NeutronHWVersion(Enum):
+    N1 = 1
+    N3 = 2
+
+
+class NeutronTargetSpec:
+    """
+    The functionality for probing the properties of Neutron Target.
+    """
+
+    def __init__(self, target: str, neutron_converter_flavor: str):
+
+        converter_manager = NeutronConverterManager(neutron_converter_flavor)
+        converter_manager.verify_target(target)
+        neutron_converter = converter_manager.get_converter()
+        self.neutron_target = neutron_converter.getNeutronTarget(target)
+
+        if self.is_subsystem():
+            raise ValueError(
+                f"Target `{target}` is not a neutron-C target. Only MCU targets are supported at the moment."
+            )
+
+        if self.get_hw_version() != NeutronHWVersion.N3:
+            raise ValueError(
+                f"Target `{target}` contains unsupported HW version. Only N3/N3+ targets are supported at the moment."
+            )
+
+    # Target name.
+    def get_name(self) -> str:
+        return self.neutron_target.name
+
+    # Whether the target has subsystem (Neutron-S) or not (Neutron-C).
+    def is_subsystem(self) -> bool:
+        return self.neutron_target.subsystem
+
+    # Number of compute units.
+    def get_num_units(self) -> int:
+        return self.neutron_target.numUnits
+
+    # Number of compute pipelines.
+    def get_num_pipes(self) -> int:
+        return self.neutron_target.numPipes
+
+    # Number of compute MACs.
+    def get_num_macs(self) -> int:
+        return self.neutron_target.numMacs
+
+    # Neutron compute block hardware version.
+    def get_hw_version(self) -> NeutronHWVersion:
+        return NeutronHWVersion(self.neutron_target.hwVersion)
diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py
index 371b7474f58..965ad41309b 100644
--- a/backends/nxp/neutron_partitioner.py
+++ b/backends/nxp/neutron_partitioner.py
@@ -8,7 +8,7 @@
 import logging
 import operator
 from dataclasses import dataclass
-from typing import Dict, final, List, Mapping
+from typing import final, Mapping
 
 import torch
 
@@ -18,13 +18,13 @@
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
 from torch.export.exported_program import ExportedProgram
 from torch.fx import Graph
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition
 from torch.fx.passes.operator_support import OperatorSupportBase
 from torch.nn import Parameter
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import *  # noqa F403
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.nxp_backend import NeutronBackend
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
@@ -64,7 +64,7 @@ class QDQCluster:
         """
 
         compute_node: torch.fx.Node
-        ops: List[torch.fx.Node]
+        ops: list[torch.fx.Node]
 
     QUANTIZE_OPERATORS = [
         exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
@@ -97,7 +97,7 @@ def is_dequant_node(node: torch.fx.Node) -> bool:
     def is_auxiliary_node(node: torch.fx.Node) -> bool:
         return node.target in QDQClusterRecognizer.AUXILIARY_OPS
 
-    def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+    def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
         """
         Return the list of nodes representing the input part of the QDQ cluster of the node `node`.
         Those are various dequantization nodes (see DEQUANTIZE_OPERATORS) optionally followed by auxiliary
@@ -125,7 +125,7 @@ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]
         logging.debug(f"Dequant Cluster for {node} is: {qdq_cluster}")
         return qdq_cluster
 
-    def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+    def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> list[torch.fx.Node]:
         """
         Returns the list of nodes representing the output part of the QDQ cluster of the `node`.
         Those are various quantize nodes (see QUANTIZE_OPERATORS) preceded by auxiliary nodes.
@@ -155,7 +155,7 @@ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node
         logging.debug(f"Quant Cluster for {node} is {qdq_cluster}")
         return qdq_cluster
 
-    def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
+    def get_qdq_cluster(self, node: torch.fx.Node) -> list[torch.fx.Node]:
         """
         Returns the QDQ cluster of the operator, if quantized. If operator is not quantized, returns empty list.
         """
@@ -167,7 +167,7 @@ def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]:
         else:
             return []
 
-    def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
+    def tag_nodes(self, nodes: list[torch.fx.Node], cluster_name: str) -> None:
         """
         Tags a node and its related dequant and quant nodes with a specified cluster name
         """
@@ -175,7 +175,7 @@ def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None:
             logging.info(f"Tagging node {node} as {cluster_name}")
             node.meta["cluster"] = cluster_name
 
-    def tag_qdq_clusters(self, nodes: List[torch.fx.Node]):
+    def tag_qdq_clusters(self, nodes: list[torch.fx.Node]):
         """
         Identifies QDQ clusters and tag them based on compute operation inside.
         """
@@ -201,6 +201,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]):
     exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter,  # noqa F405
     exir_ops.edge.aten.cat.default: CatConverter,  # noqa F405
     exir_ops.edge.aten.clone.default: CloneConverter,  # noqa F405
+    exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
     exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
@@ -210,6 +211,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]):
     exir_ops.edge.aten.mm.default: MMConverter,  # noqa F405
     exir_ops.edge.aten.relu.default: ReLUConverter,  # noqa F405
     exir_ops.edge.aten._softmax.default: SoftmaxConverter,  # noqa F405
+    exir_ops.edge.aten.sub.Tensor: SubTensorConverter,  # noqa F405
     exir_ops.edge.aten.tanh.default: TanhConverter,  # noqa F405
     exir_ops.edge.aten.view_copy.default: ViewCopyConverter,  # noqa F405
     exir_ops.edge.aten.sigmoid.default: SigmoidConverter,  # noqa F405
@@ -220,14 +222,14 @@ class NeutronSupportedOperators(OperatorSupportBase):
 
     def __init__(
         self,
-        qdq_clusters: Dict[str, QDQClusterRecognizer.QDQCluster],
-        target: Target,
-        operators_not_to_delegate: List[str],
+        qdq_clusters: dict[str, QDQClusterRecognizer.QDQCluster],
+        neutron_target_spec: NeutronTargetSpec,
+        operators_not_to_delegate: list[str],
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ):
         self.qdq_clusters = qdq_clusters
-        self.target = target
+        self.neutron_target_spec = neutron_target_spec
         self.operators_not_to_delegate = operators_not_to_delegate
         self.parameters_mapping = parameters_mapping
         self.custom_delegation_options = custom_delegation_options
@@ -269,7 +271,7 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool:
             # TODO: `view_copy` node should be delegated only if it's not the only operator in the cluster.
             node_converter.is_supported(
                 node,
-                self.target,
+                self.neutron_target_spec,
                 self.parameters_mapping,
                 self.custom_delegation_options,
             )
@@ -305,13 +307,16 @@ def is_node_supported(
 class NeutronPartitioner(Partitioner):
     def __init__(
         self,
-        compile_spec: List[CompileSpec],
+        compile_spec: list[CompileSpec],
         custom_delegation_options: CustomDelegationOptions | None = None,
     ) -> None:
         self.delegation_spec = DelegationSpec(NeutronBackend.__name__, compile_spec)
         self.custom_delegation_options = (
             custom_delegation_options or CustomDelegationOptions()
         )
+        target = self.delegation_spec[1][2].value.decode()
+        converter_flavor = self.delegation_spec[1][3].value.decode()
+        self.neutron_target_spec = NeutronTargetSpec(target, converter_flavor)
 
     def validate_partitioning_result(
         self,
@@ -343,22 +348,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         # subgraphs containing the nodes with the tags
         logging.info("NeutronPartitioner::partition")
         partition_tags = {}
+        partition_list = []
 
         graph_module = exported_program.graph_module
         nodes = list(graph_module.graph.nodes)
 
         qdq_cluster_recognizer = QDQClusterRecognizer()
         qdq_cluster_recognizer.tag_qdq_clusters(nodes)
+
         graph_module.recompile()
 
-        target = None
-        operators_not_to_delegate = ""
-        for spec in self.delegation_spec.compile_specs:
-            if spec.key == "target":
-                target = Target(spec.value.decode())
-            if spec.key == "operators_not_to_delegate":
-                operators_not_to_delegate = spec.value.decode().split(",")
-        assert target is not None
+        operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",")
         logging.info(f"Operators not to delegate: {operators_not_to_delegate}")
 
         parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters(
@@ -368,7 +368,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
             exported_program.graph_module,
             NeutronSupportedOperators(
                 qdq_cluster_recognizer.cluster_map,
-                target,
+                self.neutron_target_spec,
                 operators_not_to_delegate,
                 parameters_mapping,
                 self.custom_delegation_options,
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
index fd1687d73fd..44e9a19d9f2 100644
--- a/backends/nxp/nxp_backend.py
+++ b/backends/nxp/nxp_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -18,11 +18,11 @@
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
-from executorch.backends.nxp.backend.ir.converter.node_converter import Target
 from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat
 from executorch.backends.nxp.backend.neutron_converter_manager import (
     NeutronConverterManager,
 )
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from executorch.backends.nxp.neutron_node_extraction import (
     extract_artifacts_from_neutron_node,
     NeutronNodeArtifacts,
@@ -36,9 +36,9 @@
 
 
 class NeutronCompileSpecBuilder:
+    config: NeutronTargetSpec
 
     def __init__(self):
-        self.config: Target = None
         self.compile_spec: List[CompileSpec] = []
         self.compiler_flags = []
         self.output_format = None
@@ -68,14 +68,9 @@ def neutron_compile_spec(
             extra_flags: Extra flags for the Neutron compiler
             operators_not_to_delegate: List of operators that should not be delegated
         """
-        try:
-            self.config = Target(config)
-        except ValueError:
-            raise ValueError(
-                f"Config `{config}` is not a valid target. Must be one of `{Target.values()}`."
-            )
 
         self.neutron_converter_flavor = neutron_converter_flavor
+        self.config = NeutronTargetSpec(config, neutron_converter_flavor)
 
         assert (
             self.output_format is None
@@ -101,7 +96,7 @@ def build(self):
             self.compile_spec += [
                 CompileSpec("output_format", "tflite".encode()),
                 CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()),
-                CompileSpec("target", self.config.value.encode()),
+                CompileSpec("target", self.config.get_name().encode()),
                 CompileSpec(
                     "neutron_converter_flavor", self.neutron_converter_flavor.encode()
                 ),
@@ -187,10 +182,11 @@ def preprocess(  # noqa C901
             # Convert the edge program to TFLite.
             tflite_model, io_formats = EdgeProgramToIRConverter().convert_program(
                 edge_program,
+                neutron_target_spec=NeutronTargetSpec(target, neutron_converter_flavor),
             )
 
-            neutron_model = NeutronConverterManager().convert(
-                tflite_model, target, neutron_converter_flavor
+            neutron_model = NeutronConverterManager(neutron_converter_flavor).convert(
+                tflite_model, target
             )
 
             # Dump the tflite file if logging level is enabled
diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py
index db19bcb8ba8..2681e221869 100644
--- a/backends/nxp/quantizer/neutron_quantizer.py
+++ b/backends/nxp/quantizer/neutron_quantizer.py
@@ -36,6 +36,7 @@
     SharedSpecPattern,
     SigmoidPattern,
     SoftMaxPattern,
+    SubTensorPattern,
     TanhInPlacePattern,
     TanhPattern,
     ViewPattern,
@@ -208,6 +209,7 @@ def __init__(self):
                 NeutronAtenQuantizer(ReshapePattern(), static_qconfig),
                 NeutronAtenQuantizer(SigmoidPattern(), static_qconfig),
                 NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig),
+                NeutronAtenQuantizer(SubTensorPattern(), static_qconfig),
                 NeutronAtenQuantizer(TanhPattern(), static_qconfig),
                 NeutronAtenQuantizer(TanhInPlacePattern(), static_qconfig),
                 NeutronAtenQuantizer(ViewPattern(), static_qconfig),
diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py
index 34ee611b8b2..9588ce24c9e 100644
--- a/backends/nxp/quantizer/patterns.py
+++ b/backends/nxp/quantizer/patterns.py
@@ -224,6 +224,32 @@ def get_anchors(
         )
 
 
+class SubTensorPattern(QuantizationPattern):
+    """
+    Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes.
+
+    Basic quantization for all inputs and output.
+    """
+
+    def partition_types(self) -> list[torch.nn.Module]:
+        return [torch.ops.aten.sub.Tensor]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors | None:
+        node = fused_partition[0].nodes[-1]
+        inputs = [(node, NodeArgsIdx(0))]
+        if len(fused_partition[0].input_nodes) == 2:
+            inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))]
+
+        return PartitionAnchors(
+            inputs=inputs,
+            weights=[],
+            biases=[],
+            output=[(node,)],
+        )
+
+
 class AvgPoolPattern(SharedSpecPattern):
     """
     Quantizer for AvgPool2D operator.
diff --git a/backends/nxp/tests/TARGETS b/backends/nxp/tests/TARGETS
index f492111aff2..c8ccd5fe900 100644
--- a/backends/nxp/tests/TARGETS
+++ b/backends/nxp/tests/TARGETS
@@ -1,3 +1,4 @@
+load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest")
 
@@ -50,5 +51,9 @@ python_pytest(
         "//executorch/backends/nxp:neutron_backend",
         ":executorch_pipeline",
         ":models",
-    ]
+    ],
+    labels = [
+        "local_only",
+        ci.skip_test(),
+    ],
 )
diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py
index 592717c0b3b..632e3da055f 100644
--- a/backends/nxp/tests/executors.py
+++ b/backends/nxp/tests/executors.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 NXP
+# Copyright 2023-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -18,10 +18,8 @@
     create_channels_first_to_channels_last_permutation,
     create_channels_last_to_channels_first_permutation,
 )
-from executorch.backends.nxp.backend.ir.converter.node_converter import (
-    NodeConverter,
-    Target,
-)
+from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter
+from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec
 from torch.export import ExportedProgram
 from torch.fx import Node
 from torch.fx.graph import Graph
@@ -370,10 +368,16 @@ def convert_run_compare(
 
 
 def graph_contains_any_of_ops(graph: Graph, ops: list) -> bool:
-    return any(node.target in ops for node in graph.nodes)
+    return graph_contains_any(
+        graph, condition=lambda n: hasattr(n, "target") and n.target in ops
+    )
+
+
+def graph_contains_any(graph: Graph, condition: Callable[[Node], bool]) -> bool:
+    return any(map(condition, graph.nodes))
 
 
-target_support_check_function = Callable[[Node, Target], bool]
+target_support_check_function = Callable[[Node, NeutronTargetSpec], bool]
 
 
 class OverrideTargetSupportCheck:
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
index 567b593e05b..2c3107eae77 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py
@@ -1,3 +1,7 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
 import numpy as np
 import pytest
 import torch
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
index f5945607f1b..c02d184c5ae 100644
--- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
+++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py
@@ -4,31 +4,33 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import itertools
+import unittest
+
+import kgb
 import numpy as np
-import pytest
 import torch
 
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
-from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executorch_pipeline import (
+    to_edge_program,
+    to_quantized_edge_program,
+)
 from executorch.backends.nxp.tests.executors import (
     convert_run_compare,
+    graph_contains_any,
     graph_contains_any_of_ops,
-    ToNCHWPreprocess,
-    ToNHWCPreprocess,
+    ToChannelFirstPreprocess,
+    ToChannelLastPreprocess,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
+from parameterized import parameterized
 from torch import nn
 from torch.export import ExportedProgram
 
 
-@pytest.fixture(autouse=True)
-def reseed_model_per_test_run():
-    torch.manual_seed(23)
-    np.random.seed(23)
-
-
 class SingleConvBlockWithDropout(torch.nn.Module):
     def __init__(
         self, conv_in_channels: int = 3, perform_inplace_dropout: bool = False
@@ -74,57 +76,108 @@ def forward(self, x):
         return self.block(x)
 
 
-@pytest.mark.parametrize("inplace_dropout", [False, True])
-@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128), (1, 3, 256, 256)])
-def test_conv_dropout_quant(mocker, inplace_dropout: bool, input_shape: tuple[int]):
-    model = SingleConvBlockWithDropout(
-        conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout
-    ).eval()
+class TestCloneConverter(unittest.TestCase):
+    __test__ = False  # Prevent interfering with PyTest tests
 
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(23)
+        np.random.seed(23)
 
-    quantized_program = to_quantized_edge_program(model, input_shape).exported_program()
+    @staticmethod
+    def _node_is_clone(node) -> bool:
+        clone_ops = [
+            exir_ops.edge.aten.clone.default,
+            exir_ops.edge.dim_order_ops._clone_dim_order.default,
+        ]
 
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
-    exported_program: ExportedProgram = converter_spy.call_args.args[1]
-
-    assert not graph_contains_any_of_ops(
-        graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default]
-    )
-
-    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
-    convert_run_compare(
-        exported_program,
-        tfl_model=tflite_flatbuffers_model,
-        tflite_input_preprocess=ToNHWCPreprocess(),
-        tflite_output_preprocess=ToNCHWPreprocess(),
-        input_data=input_data,
-        atol=1.0,
-    )
+        def target_can_be_clone(node):
+            if hasattr(node, "op") and node.op == "call_function":
+                return "clone" in node.target.__name__
 
+            return False
 
-@pytest.mark.parametrize("inplace_dropout", [False, True])
-def test_clone_pool_view_copy_quant(
-    mocker, inplace_dropout: bool, input_shape: tuple[int] = (1, 64, 25, 5)
-):
-    model = KWSFinalBlock(input_shape).eval()
+        return node in clone_ops or target_can_be_clone(node)
 
-    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
-
-    quantized_program = to_quantized_edge_program(model, input_shape).exported_program()
-
-    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
-    exported_program: ExportedProgram = converter_spy.call_args.args[1]
-
-    assert not graph_contains_any_of_ops(
-        graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default]
+    @parameterized.expand(
+        list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)]))
     )
-
-    input_data = (np.random.random(input_shape) * 50).astype(np.int8)
-    convert_run_compare(
-        exported_program,
-        tfl_model=tflite_flatbuffers_model,
-        tflite_input_preprocess=ToNHWCPreprocess(),
-        input_data=input_data,
-        atol=1.0,
+    def test_conv_dropout_quant(self, inplace_dropout: bool, input_shape: tuple[int]):
+        model = SingleConvBlockWithDropout(
+            conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout
+        ).eval()
+
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program, call_original=True
+        ) as converter_spy:
+            quantized_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+
+            assert not graph_contains_any(
+                graph=quantized_program.graph,
+                condition=TestCloneConverter._node_is_clone,
+            )
+
+            input_data = (np.random.random(input_shape) * 50).astype(np.int8)
+            convert_run_compare(
+                exported_program,
+                tfl_model=tflite_flatbuffers_model,
+                tflite_input_preprocess=ToChannelLastPreprocess(),
+                tflite_output_preprocess=ToChannelFirstPreprocess(),
+                input_data=input_data,
+                atol=1.0,
+            )
+
+    @parameterized.expand(
+        list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)]))
     )
+    def test_conv_dropout_no_quant(
+        self, inplace_dropout: bool, input_shape: tuple[int]
+    ):
+        model = SingleConvBlockWithDropout(
+            conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout
+        ).eval()
+
+        edge_program = to_edge_program(model, input_shape).exported_program()
+
+        has_clone = graph_contains_any_of_ops(
+            graph=edge_program.graph,
+            ops=[
+                exir_ops.edge.aten.clone.default,
+                exir_ops.edge.dim_order_ops._clone_dim_order.default,
+            ],
+        )
+
+        # Clone with inplace=True should not produce clone edge op and vice versa
+        assert inplace_dropout ^ has_clone
+
+    def test_clone_pool_view_copy_quant(self, input_shape: tuple[int] = (1, 64, 25, 5)):
+        model = KWSFinalBlock(input_shape).eval()
+
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program, call_original=True
+        ) as converter_spy:
+            quantized_program = to_quantized_edge_program(
+                model, input_shape
+            ).exported_program()
+
+            tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+
+            assert not graph_contains_any(
+                graph=quantized_program.graph,
+                condition=TestCloneConverter._node_is_clone,
+            )
+
+            input_data = (np.random.random(input_shape) * 50).astype(np.int8)
+            convert_run_compare(
+                exported_program,
+                tfl_model=tflite_flatbuffers_model,
+                tflite_input_preprocess=ToChannelLastPreprocess(),
+                input_data=input_data,
+                atol=1.0,
+            )
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
new file mode 100644
index 00000000000..98566ff1ad6
--- /dev/null
+++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py
@@ -0,0 +1,175 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import pytest
+import torch
+
+from executorch.backends.nxp.backend.edge_program_converter import (
+    EdgeProgramToIRConverter,
+)
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executors import (
+    convert_run_compare,
+    ToChannelFirstPreprocess,
+    ToChannelLastPreprocess,
+)
+from executorch.backends.nxp.tests.models import (
+    SubTensorConvModule,
+    SubTensorModule,
+    SubTensorOneInputModule,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
+
+
+@pytest.fixture(autouse=True)
+def reseed_model_per_test_run():
+    torch.manual_seed(23)
+    np.random.seed(23)
+
+
+@pytest.mark.parametrize(
+    "input_shape",
+    [
+        pytest.param((4,), id="1D."),
+        pytest.param((6, 6), id="2D."),
+        pytest.param((1, 4, 8), id="3D."),
+        pytest.param((1, 4, 8, 8), id="4D."),
+    ],
+)
+def test_sub_tensor_quant_conversion(mocker, input_shape):
+    model = SubTensorModule()
+
+    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+    # Run conversion
+    _ = to_quantized_edge_program(model, [input_shape, input_shape])
+
+    # Capture generated model
+    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+    # Capture converted program
+    exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+    input_data_1 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+        np.int8
+    )
+    input_data_2 = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+        np.int8
+    )
+    input_data = {0: input_data_1, 1: input_data_2}
+
+    nodes = list(exported_program.graph.nodes)
+    assert nodes[4].target == exir_ops.edge.aten.sub.Tensor
+
+    convert_run_compare(
+        exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+    )
+
+
+@pytest.mark.parametrize(
+    "input_shape",
+    [
+        pytest.param((4,), id="1D."),
+        pytest.param((6, 6), id="2D."),
+        pytest.param((1, 4, 8), id="3D."),
+        pytest.param((1, 4, 8, 8), id="4D."),
+    ],
+)
+def test_sub_tensor_one_input_quant_conversion(mocker, input_shape):
+    model = SubTensorOneInputModule()
+
+    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+    # Run conversion
+    _ = to_quantized_edge_program(model, input_shape)
+
+    # Capture generated model
+    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+    # Capture converted program
+    exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+    input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8)
+
+    nodes = list(exported_program.graph.nodes)
+    assert nodes[2].target == exir_ops.edge.aten.sub.Tensor
+
+    convert_run_compare(
+        exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data
+    )
+
+
+@pytest.mark.parametrize(
+    "x_input_shape",
+    [
+        pytest.param((1, 4, 8, 8), id="4D."),
+        pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."),
+    ],
+)
+def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape):
+    model = SubTensorConvModule()
+
+    converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program")
+
+    n, c, h, w = x_input_shape
+    y_input_shape = (n, 8, h, w)
+
+    # Run conversion
+    _ = to_quantized_edge_program(model, [x_input_shape, y_input_shape])
+
+    # Capture generated model
+    tflite_flatbuffers_model, io_formats = converter_spy.spy_return
+
+    # Capture converted program
+    exported_program: ExportedProgram = converter_spy.call_args.args[1]
+
+    input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype(
+        np.int8
+    )
+    input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype(
+        np.int8
+    )
+    input_data = {0: input_data_1, 1: input_data_2}
+
+    nodes = list(exported_program.graph.nodes)
+    assert nodes[15].target == exir_ops.edge.aten.sub.Tensor
+
+    convert_run_compare(
+        exported_program,
+        input_data=input_data,
+        tflite_input_preprocess=ToChannelLastPreprocess(),
+        tfl_model=tflite_flatbuffers_model,
+        tflite_output_preprocess=ToChannelFirstPreprocess(),
+    )
+
+
+@pytest.mark.parametrize(
+    "x_input_shape, y_input_shape",
+    [
+        pytest.param((1, 4, 7), (4, 7), id="3D -> 2D."),
+        pytest.param((1, 4, 8), (1, 4, 4, 8), id="3D -> 4D."),
+        pytest.param((1, 1, 4, 4, 8), (1, 4, 4, 8), id="5D -> 4D."),
+        pytest.param((4,), (4, 4), id="1D -> 2D."),
+        pytest.param((4,), (4, 4, 4), id="1D -> 3D."),
+        pytest.param((6, 6), (1, 8, 6, 6), id="2D -> 4D."),
+        pytest.param((6, 6), (6,), id="2D -> 1D."),
+    ],
+)
+def test_sub_tensor_broadcasting_unsupported_quant_conversion(
+    x_input_shape, y_input_shape
+):
+    model = SubTensorModule()
+
+    # Run conversion
+    edge_program = to_quantized_edge_program(
+        model, [x_input_shape, y_input_shape]
+    ).exported_program()
+    nodes = list(edge_program.graph.nodes)
+
+    # Broadcast is not supported, node is not converted
+    assert (
+        nodes[6].target == exir_ops.edge.aten.sub.Tensor
+    )  # Sub Tensor is not delegated.
diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py
index e7b60b2566c..f613349fed0 100644
--- a/backends/nxp/tests/models.py
+++ b/backends/nxp/tests/models.py
@@ -451,6 +451,34 @@ def forward(x):
         return x + x
 
 
+class SubTensorModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @staticmethod
+    def forward(x, y):
+        return x - y
+
+
+class SubTensorConvModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = Conv2dModule(padding=1, stride=1)
+
+    def forward(self, x, y):
+        x = self.conv(x)
+        return x - y
+
+
+class SubTensorOneInputModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @staticmethod
+    def forward(x):
+        return x - x
+
+
 class MeanDimLinearModule(torch.nn.Module):
     def __init__(self, dim, keepdim):
         super().__init__()
diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py
index 53e54ec2f56..c9917651fbd 100644
--- a/backends/nxp/tests/test_neutron_backend.py
+++ b/backends/nxp/tests/test_neutron_backend.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py
index e10e8cca67b..2fcfd8cd987 100644
--- a/backends/nxp/tests/test_neutron_converter_manager.py
+++ b/backends/nxp/tests/test_neutron_converter_manager.py
@@ -1,4 +1,4 @@
-# Copyright 2024 NXP
+# Copyright 2024-2025 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -29,9 +29,7 @@ def test_conv2d_neutron_conversion__default_flavor():
     )
 
     neutron_converter_manager = NeutronConverterManager()
-    neutron_model = neutron_converter_manager.convert(
-        tflite_model, "imxrt700", "SDK_25_09"
-    )
+    neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700")
 
     assert len(
         neutron_model
@@ -50,9 +48,8 @@ def test__conv2d_neutron_conversion__invalid_flavor():
         edge_program_manager.exported_program()
     )
 
-    neutron_converter_manager = NeutronConverterManager()
     with pytest.raises(RuntimeError) as excinfo:
-        _ = neutron_converter_manager.convert(tflite_model, "imxrt700", "bad_flavor")
+        _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700")
 
     assert "Neutron Converter module with flavor 'bad_flavor' not found." in str(
         excinfo
diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py
index 21ab1c5b59a..4c9f277e34d 100644
--- a/backends/nxp/tests/test_split_group_convolution.py
+++ b/backends/nxp/tests/test_split_group_convolution.py
@@ -110,7 +110,7 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int
         input_data = torch.randn(input_shape, dtype=torch.float32)
         out1 = original_module(input_data).detach().numpy()
         out2 = modified_module(input_data).detach().numpy()
-        assert np.allclose(out1, out2, atol=2.0e-7)
+        assert np.allclose(out1, out2, atol=2.0e-7, rtol=1.9e-4)
 
         # Make sure the graph can be correctly quantized and lowered to edge.
         ep = _quantize_and_lower_module(
diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index 4d32d8932c2..736ed6d8603 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -53,35 +53,11 @@ target_sources(
 
 executorch_target_link_options_shared_lib(openvino_backend)
 
-if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
-  # Build executor runner binary for openvino backend
-  list(APPEND openvino_executor_runner_libs openvino_backend executorch)
-
-  set(_openvino_executor_runner__srcs
-      ${EXECUTORCH_ROOT}/examples/portable/executor_runner/executor_runner.cpp
-      ${EXECUTORCH_ROOT}/extension/data_loader/file_data_loader.cpp
-      ${EXECUTORCH_ROOT}/extension/evalue_util/print_evalue.cpp
-      ${EXECUTORCH_ROOT}/extension/runner_util/inputs.cpp
-      ${EXECUTORCH_ROOT}/extension/runner_util/inputs_portable.cpp
-  )
-  add_executable(openvino_executor_runner ${_openvino_executor_runner__srcs})
-
-  list(APPEND openvino_executor_runner_libs)
-
-  target_link_libraries(
-    openvino_executor_runner gflags portable_ops_lib
-    ${openvino_executor_runner_libs}
-  )
-  target_compile_options(
-    openvino_executor_runner PUBLIC ${_common_compile_options}
-  )
-endif()
-
 # Install OpenVINO backend library to the lib directory
 install(
   TARGETS openvino_backend
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
 )
diff --git a/backends/openvino/README.md b/backends/openvino/README.md
index a67cf12eca2..5ce38ade56f 100644
--- a/backends/openvino/README.md
+++ b/backends/openvino/README.md
@@ -18,6 +18,11 @@ For more information on the supported hardware, please refer to [OpenVINO System
 executorch
 ├── backends
 │   └── openvino
+│       ├── quantizer
+│           ├── observers
+│               └── nncf_observers.py
+│           ├── __init__.py
+│           └── quantizer.py
 │       ├── runtime
 │           ├── OpenvinoBackend.cpp
 │           └── OpenvinoBackend.h
@@ -42,11 +47,23 @@ executorch
 
 Before you begin, ensure you have openvino installed and configured on your system.
 
-### Build OpenVINO from Source
+### Use OpenVINO from Release Packages
+
+1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
+
+2. Extract the release package from the archive and set the environment variables.
+
+   ```bash
+   tar -zxf openvino_toolkit_<your_release_configuration>.tgz
+   cd openvino_toolkit_<your_release_configuration>
+   source setupvars.sh
+   ```
+
+### (Optional) Build OpenVINO from Source
 
 ```bash
 git clone https://github.com/openvinotoolkit/openvino.git
-cd openvino && git checkout b16b776ac119dafda51f69a80f1e6b7376d02c3b
+cd openvino
 git submodule update --init --recursive
 sudo ./install_build_dependencies.sh
 mkdir build && cd build
@@ -59,44 +76,45 @@ cd <your_preferred_install_location>
 source setupvars.sh
 ```
 
-### Use OpenVINO from Release Packages
-
-1. Download the OpenVINO release package from [here](https://docs.openvino.ai/2025/get-started/install-openvino.html). Make sure to select your configuration and click on **OpenVINO Archives** under the distribution section to download the appropriate archive for your platform.
-
-2. Extract the release package from the archive and set the environment variables.
-
-   ```bash
-   tar -zxf openvino_toolkit_<your_release_configuration>.tgz
-   cd openvino_toolkit_<your_release_configuration>
-   source setupvars.sh
-   ```
-
 For more information about OpenVINO build, refer to the [OpenVINO Build Instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/build_linux.md).
 
 ### Setup
 
 Follow the steps below to setup your build environment:
 
-1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment.
 
-2. **Setup OpenVINO Backend Environment**
-- Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
+1. **Create a Virtual Environment**
+- Create a virtual environment and activate it by executing the commands below.
    ```bash
-   pip install -r requirements.txt
+   python -m venv env
+   source env/bin/activate
    ```
-  Note: To achieve optimal performance with NNCF quantization, you should install the latest development version of NNCF (version 2.16.0.dev0+191b53d9 or higher).
-3. Navigate to `scripts/` directory.
-
-4. **Build OpenVINO Backend C++ Libraries and Executor Runner**: Once the prerequisites are in place, run the `openvino_build.sh` script to start the build process. By default, OpenVINO backend will be built under `cmake-out/backends/openvino/` as `libopenvino_backend.a`
-
+2. **Clone ExecuTorch Repository from Github**
+- Clone Executorch repository by executing the command below.
    ```bash
-   ./openvino_build.sh
+   git clone --recurse-submodules https://github.com/pytorch/executorch.git
    ```
-   **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
-
+3. **Build ExecuTorch with OpenVINO Backend**
+- Ensure that you are inside `executorch/backends/openvino/scripts` directory. The following command builds and installs ExecuTorch with the OpenVINO backend, also compiles the C++ runtime libraries and binaries into `<executorch_root>/cmake-out` for quick inference testing.
    ```bash
+   openvino_build.sh
+   ```
+- Optionally, `openvino_build.sh` script can be used to build python package or C++ libraries/binaries seperately.
+
+   **Build OpenVINO Backend Python Package with Pybindings**: To build and install the OpenVINO backend Python package with Python bindings, run the `openvino_build.sh` script with the `--enable_python` argument as shown in the below command. This will compile and install the ExecuTorch Python package with the OpenVINO backend into your Python environment. This option will also enable python bindings required to execute OpenVINO backend tests and `aot_optimize_and_infer.py` script inside `executorch/examples/openvino` folder.
+     ```bash
    ./openvino_build.sh --enable_python
    ```
+   **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `<executorch_root>/cmake-out` directory. The binary located at `<executorch_root>/cmake-out/executor_runner` can be used to run inference with vision models.
+     ```bash
+   ./openvino_build.sh --cpp_runtime
+   ```
+   **Build C++ Llama Runner**: First, ensure the C++ runtime libraries are built by following the earlier instructions. Then, run the `openvino_build.sh` script with the `--llama_runner flag` to compile the LlaMA runner as shown the below command, which enables executing inference with models exported using export_llama. The resulting binary is located at: `<executorch_root>/cmake-out/examples/models/llama/llama_main`
+     ```bash
+   ./openvino_build.sh --llama_runner
+   ```
+
+For more information about ExecuTorch environment setup, refer to the [Environment Setup](https://pytorch.org/executorch/main/getting-started-setup#environment-setup) guide.
 
 ### Run
 
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..0d407e33f6e 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -26,12 +26,24 @@
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 
+class PatternNode:
+    op_types: dict[str, Optional[list]] = {}
+
+    def __init__(self):
+        self.op_types = {}
+
+
 class OpenvinoOperatorsSupport(OperatorSupportBase):
+    extended_support_dict = {
+        "torch.ops.dim_order_ops._clone_dim_order.default": None,
+        "torch.ops.dim_order_ops._to_dim_order_copy.default": None,
+    }
 
     def __init__(
         self,
         op_types_to_skip: Optional[set] = None,
         op_names_to_skip: Optional[set] = None,
+        enabled_ops_by_name: Optional[set] = None,
     ) -> None:
         """
         Initializes the OpenvinoOperatorsSupport class.
@@ -43,9 +55,12 @@ def __init__(
             op_types_to_skip = set()
         if op_names_to_skip is None:
             op_names_to_skip = set()
+        if enabled_ops_by_name is None:
+            enabled_ops_by_name = set()
 
         self._op_types_to_skip = op_types_to_skip
         self._op_names_to_skip = op_names_to_skip
+        self._enabled_ops_by_name = enabled_ops_by_name
 
     def is_node_supported(self, _, node: torch.fx.Node) -> bool:
         """
@@ -62,7 +77,13 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
             op_type = node.target.__name__
         else:
             op_type = str(node.target)
-        supported_ops = OperatorSupport(options)._support_dict
+
+        if node.name in self._enabled_ops_by_name:
+            return True
+
+        supported_ops = (
+            OperatorSupport(options)._support_dict | self.extended_support_dict
+        )
         if op_type == "getitem":
             return True
 
@@ -99,6 +120,7 @@ def __init__(
         self.delegation_spec = DelegationSpec(OpenvinoBackend.__name__, compile_spec)
         self._op_types_to_skip = op_types_to_skip
         self._op_names_to_skip = op_names_to_skip
+        self._enabled_ops_by_name: set = set()
 
     def ops_to_not_decompose(
         self,
@@ -117,9 +139,72 @@ def ops_to_not_decompose(
             torch.ops.aten.upsample_bilinear2d.vec,
             torch.ops.aten.upsample_nearest2d.default,
             torch.ops.aten.upsample_nearest2d.vec,
+            torch.ops.aten.stack.default,
         ]
         return (ops_not_decompose, None)
 
+    def check_pattern(
+        self, node: torch.fx.Node, pattern: type[PatternNode], enabled_ops: list
+    ) -> bool:
+        if node.op == "call_function":
+            if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:  # type: ignore[union-attr]
+                pt_input_nodes = node.all_input_nodes
+                pattern_input_ops = pattern.op_types[
+                    "call_function" + ":" + str(node.target.__name__)  # type: ignore[union-attr]
+                ]
+                if pattern_input_ops is None:
+                    enabled_ops.append(node)
+                    return True
+                if len(pt_input_nodes) != len(pattern_input_ops):
+                    return False
+                for i in range(len(pt_input_nodes)):
+                    if not self.check_pattern(
+                        pt_input_nodes[i], pattern_input_ops[i], enabled_ops
+                    ):
+                        return False
+                enabled_ops.append(node)
+                return True
+        elif node.op == "get_attr":
+            if "get_attr" in pattern.op_types:
+                return True
+            else:
+                return False
+        elif node.op == "placeholder":
+            if "placeholder" in pattern.op_types:
+                return True
+            else:
+                return False
+        return False
+
+    def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
+        const_node = PatternNode
+        const_node.op_types["get_attr"] = None
+        const_node.op_types["placeholder"] = None
+        bitwise_right_shift_node = PatternNode
+        bitwise_right_shift_node.op_types[
+            "call_function:aten.bitwise_right_shift.Tensor_Scalar"
+        ] = [const_node]
+        bitwise_and_node = PatternNode
+        bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [
+            const_node
+        ]
+        stack_node = PatternNode
+        stack_node.op_types["call_function:aten.stack.default"] = [
+            bitwise_and_node,
+            bitwise_right_shift_node,
+        ]
+
+        for node in graph_module.graph.nodes:
+            if (
+                str(node.op) == "call_function"
+                and str(node.target.__name__) == "aten.stack.default"
+            ):
+                enabled_ops: list = []
+                pattern_match = self.check_pattern(node, stack_node, enabled_ops)
+                if pattern_match:
+                    for pattern_op in enabled_ops:
+                        self._enabled_ops_by_name.add(pattern_op.name)
+
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         """
         Partitions an exported program into supported and unsupported segments.
@@ -127,9 +212,14 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         :param exported_program: The exported program.
         :return: A PartitionResult containing the partitioned graph and delegation tags.
         """
+        self.capture_nncf_patterns(exported_program.graph_module)
         partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
-            OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
+            OpenvinoOperatorsSupport(
+                self._op_types_to_skip,
+                self._op_names_to_skip,
+                self._enabled_ops_by_name,
+            ),
             allows_single_node_partition=True,
         )
         partition_list = partitioner.propose_partitions()
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..691115f6579 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -14,6 +14,8 @@
     PreprocessResult,
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
 from openvino.frontend.pytorch.torchdynamo.compile import (  # type: ignore[import-untyped]
     openvino_compile,
 )
@@ -36,6 +38,12 @@ def preprocess(
         Returns:
             PreprocessResult: The result of preprocessing, including the compiled model bytes.
         """
+        transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module)
+
+        # Update the edge_program with the transformed graph
+        if transformed_ep and transformed_ep.graph_module:
+            edge_program._graph_module = transformed_ep.graph_module
+
         input_names = edge_program.graph_signature.user_inputs
         args = []
         for node in edge_program.graph.nodes:
diff --git a/backends/openvino/quantizer/__init__.py b/backends/openvino/quantizer/__init__.py
index df038483f2f..5aae52ef3e8 100644
--- a/backends/openvino/quantizer/__init__.py
+++ b/backends/openvino/quantizer/__init__.py
@@ -1,3 +1,3 @@
-from .quantizer import OpenVINOQuantizer, quantize_model
+from .quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model
 
-__all__ = ["OpenVINOQuantizer", "quantize_model"]
+__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
diff --git a/backends/openvino/quantizer/observers.py b/backends/openvino/quantizer/observers.py
new file mode 100644
index 00000000000..6cda4561604
--- /dev/null
+++ b/backends/openvino/quantizer/observers.py
@@ -0,0 +1,186 @@
+# Copyright (c) Intel Corporation
+#
+# Licensed under the BSD License (the "License"); you may not use this file
+# except in compliance with the License. See the license file found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: disable-error-code=import-not-found
+
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import torch
+
+from nncf.experimental.torch.fx.node_utils import (  # type: ignore[import-untyped]
+    get_tensor_constant_from_node,
+)
+from nncf.experimental.torch.fx.transformations import (  # type: ignore[import-untyped]
+    constant_update,
+    module_insertion,
+    node_removal,
+)
+from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
+    WeightCompressionParameters,
+)
+from nncf.quantization.algorithms.weight_compression.weight_lowering import (  # type: ignore[import-untyped]
+    do_integer_quantization,
+)
+from nncf.tensor.tensor import Tensor as NNCFTensor  # type: ignore[import-untyped]
+from nncf.torch.graph.transformations.commands import (  # type: ignore[import-untyped]
+    PTTargetPoint,
+    TargetType,
+)
+from nncf.torch.quantization.layers import (  # type: ignore[import-untyped]
+    BaseWeightsDecompressor,
+    INT4AsymmetricWeightsDecompressor,
+    INT4SymmetricWeightsDecompressor,
+    INT8AsymmetricWeightsDecompressor,
+    INT8SymmetricWeightsDecompressor,
+)
+from torchao.quantization.pt2e import ObserverBase
+
+
+class WeightObserverBase(ObserverBase, ABC):
+    """
+    Base implementation of an NNCF observer that defines the rules for compressing layer weights into the OpenVINO representation.
+    """
+
+    def __init__(
+        self,
+        wc_param: WeightCompressionParameters,
+        dtype: torch.dtype,
+        **kwargs,
+    ) -> None:
+        """
+        :param wc_param: Weight compression parameters container.
+        :param dtype: target dtype for the quantization.
+        """
+        super().__init__(dtype=dtype, is_dynamic=False)
+        self._wc_param = wc_param
+
+    def calculate_qparams(  # type: ignore[override]
+        self,
+        weight: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Calculates quantization parameters: quantized weight, quantization scale and quantization zero point.
+
+        :param weight: FP weight to be used for calculating qparams.
+        :return: A tuple containing the quantized weight, quantization scale and quantization zero point.
+        """
+        wc_param = self._wc_param
+        wc_config = wc_param.compression_config
+        reduction_axes = wc_param.reduction_axes
+        q_weight, scale, zp = do_integer_quantization(
+            NNCFTensor(weight), wc_config, reduction_axes=reduction_axes
+        )
+        zp = zp.data if zp is not None else None
+        return q_weight.data, scale.data, zp
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+    def convert(
+        self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
+    ) -> None:
+        """
+        Replaces the given observer node from the given model with a quantized
+        weight and a OpenVINO specific decompression module.
+
+        :param model: A `torch.fx.GraphModule` representing the statically traced model
+                    with observer nodes attached and calibrated.
+        :param observer_node: The `torch.fx.Node` corresponding to the observer module for
+                            the weight that is being transformed into a compressed representation.
+        """
+        weight_node = observer_node.args[0]
+        original_weight = get_tensor_constant_from_node(weight_node, model)
+        q_weight, scale, zero_point = self.calculate_qparams(original_weight)
+
+        decompressor = self._create_decompressor(
+            scale, zero_point, q_weight, original_weight
+        )
+        packed_q_weight = decompressor.pack_weight(q_weight)
+
+        # Weight port id is 0 since observer is inserted for a single weight only.
+        constant_update(model, observer_node, packed_q_weight, input_port_id=0)
+
+        compressed_weight_name = observer_node.all_input_nodes[0].name
+        decompressor_suffix = "_".join(
+            compressed_weight_name.replace(".", "_").split("_")[:-2]
+        )
+        decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"
+
+        module_insertion(
+            model,
+            decompressor,
+            [
+                PTTargetPoint(
+                    TargetType.OPERATOR_POST_HOOK,
+                    target_node_name=compressed_weight_name,
+                )
+            ],
+            decompressor_name,
+        )
+        node_removal(model, observer_node, 0)
+
+    @abstractmethod
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        """
+        Returns a respective NNCF decompressor for different types of quantization.
+
+        :param scale: Calculated scale quantization parameter.
+        :param zero_point: Calculated zero_point quantization parameter.
+        :param q_weight: Calculated quantized weight.
+        :param original_weight: FP weight.
+        :return: NNCF observer according to the qmode which creates the decompression subgraph supported by OpenVINO.
+        """
+
+
+class INT4WeightObserver(WeightObserverBase):
+    """
+    OpenVINO INT4 Weight Compression observer.
+    """
+
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        if zero_point is None:
+            return INT4SymmetricWeightsDecompressor(
+                scale, q_weight.shape, original_weight.shape, original_weight.dtype
+            )
+        return INT4AsymmetricWeightsDecompressor(
+            scale,
+            zero_point,
+            q_weight.shape,
+            original_weight.shape,
+            original_weight.dtype,
+        )
+
+
+class INT8WeightObserver(WeightObserverBase):
+    """
+    OpenVINO INT8 Weight Compression per channel observer.
+    """
+
+    def _create_decompressor(
+        self,
+        scale: torch.Tensor,
+        zero_point: Optional[torch.Tensor],
+        q_weight: torch.Tensor,
+        original_weight: torch.Tensor,
+    ) -> BaseWeightsDecompressor:
+        if zero_point is None:
+            return INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
+        return INT8AsymmetricWeightsDecompressor(
+            scale, zero_point, original_weight.dtype
+        )
diff --git a/backends/openvino/quantizer/quantizer.py b/backends/openvino/quantizer/quantizer.py
index edce272ff9b..bef1ef3274f 100644
--- a/backends/openvino/quantizer/quantizer.py
+++ b/backends/openvino/quantizer/quantizer.py
@@ -15,8 +15,17 @@
 import nncf.experimental.torch.fx as nncf_fx  # type: ignore[import-untyped]
 
 import torch.fx
-
+from executorch.backends.openvino.quantizer.observers import (
+    INT4WeightObserver,
+    INT8WeightObserver,
+)
 from nncf.common.graph.graph import NNCFGraph  # type: ignore[import-untyped]
+from nncf.quantization.algorithms.weight_compression.config import (  # type: ignore[import-untyped]
+    WeightCompressionParameters,
+)
+from nncf.quantization.quantize_model import (  # type: ignore[import-untyped]
+    get_weight_compression_configuration,
+)
 from torchao.quantization.pt2e import (
     HistogramObserver,
     PerChannelMinMaxObserver,
@@ -30,7 +39,8 @@
     Quantizer,
     SharedQuantizationSpec,
 )
-from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
+
+QUANT_ANNOTATION_KEY = "quantization_annotation"
 
 
 class QuantizationMode(Enum):
@@ -40,11 +50,19 @@ class QuantizationMode(Enum):
     - INT8_SYM: INT8 symmetric quantization for both activations and weights.
     - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
     - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
+    - INT8WO_SYM: INT8 symmetric quantization for weights only.
+    - INT8WO_ASYM: INT8 asymmetric quantization for weights only.
+    - INT4WO_SYM: INT4 symmetric quantization for weights only.
+    - INT4WO_ASYM: INT4 asymmetric quantization for weights only
     """
 
     INT8_SYM = "int8_sym"
     INT8_MIXED = "int8_mixed"
     INT8_TRANSFORMER = "int8_transformer"
+    INT8WO_SYM = "int8wo_sym"
+    INT8WO_ASYM = "int8wo_asym"
+    INT4WO_SYM = "int4wo_sym"
+    INT4WO_ASYM = "int4wo_asym"
 
 
 class OpenVINOQuantizer(Quantizer):
@@ -53,10 +71,17 @@ class OpenVINOQuantizer(Quantizer):
     optimally for the inference via OpenVINO.
     """
 
+    WEIGHTS_ONLY_COMPRESSION_MODES = (
+        QuantizationMode.INT4WO_SYM,
+        QuantizationMode.INT4WO_ASYM,
+        QuantizationMode.INT8WO_SYM,
+        QuantizationMode.INT8WO_ASYM,
+    )
+
     def __init__(
         self,
         *,
-        mode: Optional[QuantizationMode] = QuantizationMode.INT8_SYM,
+        mode: QuantizationMode = QuantizationMode.INT8_SYM,
         **kwargs,
     ):
         """
@@ -65,22 +90,37 @@ def __init__(
             - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
             - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
             Default value is INT8_SYM.
+            - INT4_SYM: Symmetric INT4 Weights-Only Compression
+            - INT4_ASYM: Asymmetric INT4 Weights-Only Compression
         :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
         """
-        if mode == QuantizationMode.INT8_SYM:
-            preset = quantization.structs.QuantizationPreset.PERFORMANCE
-            model_type = None
-        elif mode == QuantizationMode.INT8_MIXED:
-            preset = quantization.structs.QuantizationPreset.MIXED
-            model_type = None
+        self.mode = mode
+        if self.mode not in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+            if mode == QuantizationMode.INT8_SYM:
+                preset = quantization.structs.QuantizationPreset.PERFORMANCE
+                model_type = None
+            elif mode == QuantizationMode.INT8_MIXED:
+                preset = quantization.structs.QuantizationPreset.MIXED
+                model_type = None
+            else:
+                preset = None
+                model_type = nncf.parameters.ModelType.TRANSFORMER
+            self._algo = (
+                nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
+                    preset=preset, model_type=model_type, **kwargs
+                )
+            )
         else:
-            preset = None
-            model_type = nncf.parameters.ModelType.TRANSFORMER
-        self._min_max_algo = (
-            nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
-                preset=preset, model_type=model_type, **kwargs
+            weight_compression_configuration = get_weight_compression_configuration(
+                mode.value.replace(
+                    "wo", ""
+                ),  # Mode value has to match NNCF CompressWeightsMode
+                **kwargs,
+            )
+            subset_size = 1  # Doesn't really matter in this case since it is data-free. Should just be +ve
+            self._algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+                subset_size=subset_size, **weight_compression_configuration
             )
-        )
 
     def set_ignored_scope(
         self,
@@ -101,7 +141,7 @@ def set_ignored_scope(
         :param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
           in the model graph.
         """
-        self._min_max_algo.set_ignored_scope(
+        self._algo.set_ignored_scope(
             nncf.IgnoredScope(
                 names=names or [],
                 patterns=patterns or [],
@@ -114,27 +154,73 @@ def set_ignored_scope(
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
-        self._min_max_algo._set_backend_entity(model)
-        return self._min_max_algo.find_quantization_setup(model, nncf_graph)
+        self._algo._set_backend_entity(model)
+        return self._algo.find_quantization_setup(model, nncf_graph)
 
-    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
-        nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
-        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
+    def _annotate_weight_compression(
+        self,
+        model: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
+        nncf_graph: NNCFGraph,
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+    ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+        """
+        Annotates the model graph with weight-only quantization specs.
 
-        graph = model.graph
-        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
-            defaultdict(QuantizationAnnotation)
+        Identifies compressible nodes in the NNCF graph and attaches the corresponding
+        TorchAO quantization specifications to their weight edges for later transformation.
+
+        :param model: The FX GraphModule to annotate.
+        :param graph: The underlying FX graph.
+        :param nncf_graph: The corresponding NNCF graph.
+        :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+        :return: Updated mapping of FX nodes with weight compression annotations.
+        """
+        self._algo.set_backend_entity(model)
+        all_wc_params, _ = self._algo.get_weight_compression_parameters(
+            model, nncf_graph
         )
 
+        for wc_param in all_wc_params:
+            node_with_weight = wc_param.node_with_weight
+            target_node = nncf_fx.node_utils.get_graph_node_by_name(
+                graph, node_with_weight.node_name
+            )
+            annotation = node_vs_torch_annotation[target_node]
+            edge_or_node = self._get_weight_edge(target_node, nncf_graph)
+            qspec = self._get_torch_ao_qspec_from_nncf_config_for_wc(wc_param=wc_param)
+            self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+
+        return node_vs_torch_annotation
+
+    def _annotate_post_training_quantization(
+        self,
+        model: torch.fx.GraphModule,
+        graph: torch.fx.Graph,
+        nncf_graph: NNCFGraph,
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation],
+    ) -> DefaultDict[torch.fx.Node, QuantizationAnnotation]:
+        """
+        Annotates the model graph with post-training quantization configurations.
+
+        :param model: The FX GraphModule to annotate.
+        :param graph: The underlying FX graph.
+        :param nncf_graph: The corresponding NNCF graph.
+        :param node_vs_torch_annotation: A mapping of FX nodes to quantization annotations.
+        :return: Updated mapping of FX nodes with post-training quantization annotations.
+        """
+        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
+
         for qp in quantization_setup.quantization_points.values():
             edge_or_node, annotation = self._get_edge_or_node_and_annotation(
                 graph, nncf_graph, qp, node_vs_torch_annotation
             )
-            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
+            qspec: QuantizationSpecBase = (
+                self._get_torch_ao_qspec_from_nncf_config_for_ptq(qp)
+            )
             self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
         for quantizer_ids in quantization_setup.unified_scale_groups.values():
-
             root_quantizer_id = self._get_unified_scales_root_quantizer_id(
                 nncf_graph, quantizer_ids, quantization_setup
             )
@@ -145,14 +231,12 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
                 for q_id in quantizer_ids
             ):
                 qps = [
-                    quantization_setup.quantization_points[q_id]
-                    for q_id in quantizer_ids
+                    quantization_setup.quantization_points[qid] for qid in quantizer_ids
                 ]
-                msg = (
+                raise nncf.InternalError(
                     "Different quantization configs are set to one unified scale group:"
                     f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
                 )
-                raise nncf.InternalError(msg)
 
             root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
                 graph, root_qp.insertion_point.target_node_name
@@ -165,16 +249,35 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
                 if quantizer_id == root_quantizer_id:
                     continue
 
-                qspec = SharedQuantizationSpec(root_edge_or_node)
+                qspec = SharedQuantizationSpec(root_edge_or_node)  # type: ignore[assignment]
                 qp = quantization_setup.quantization_points[quantizer_id]
                 edge_or_node, annotation = self._get_edge_or_node_and_annotation(
                     graph, nncf_graph, qp, node_vs_torch_annotation
                 )
                 self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
 
+        return node_vs_torch_annotation
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
+        graph = model.graph
+        node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
+            defaultdict(QuantizationAnnotation)
+        )
+
+        if self.mode in OpenVINOQuantizer.WEIGHTS_ONLY_COMPRESSION_MODES:
+            node_vs_torch_annotation = self._annotate_weight_compression(
+                model, graph, nncf_graph, node_vs_torch_annotation
+            )
+        else:
+            node_vs_torch_annotation = self._annotate_post_training_quantization(
+                model, graph, nncf_graph, node_vs_torch_annotation
+            )
+
         for node, annotation in node_vs_torch_annotation.items():
-            assert Q_ANNOTATION_KEY not in node.meta
-            node.meta[Q_ANNOTATION_KEY] = annotation
+            assert QUANT_ANNOTATION_KEY not in node.meta
+            node.meta[QUANT_ANNOTATION_KEY] = annotation
+
         return model
 
     @staticmethod
@@ -236,6 +339,35 @@ def _get_edge_or_node_and_annotation(
         edge_or_node = OpenVINOQuantizer._get_edge_or_node(target_node, qp, nncf_graph)
         return edge_or_node, annotation
 
+    @staticmethod
+    def _get_weight_edge(
+        target_node: torch.fx.Node,
+        nncf_graph: NNCFGraph,
+    ) -> tuple[torch.fx.Node, torch.fx.Node]:
+        """
+        Returns the FX node corresponding to the weight tensor input of a given operator node.
+        Uses the NNCF graph to identify which input port of the target node holds the weight.
+        If multiple weight ports are present, a warning is issued and only the first one is used.
+
+        :param target_node: FX node representing a weighted operation (e.g., Linear, Conv).
+        :param nncf_graph: NNCFGraph used to determine weight port indices.
+        :return: Edge represented by a Tuple of (weight_node, target_node), where weight_node is the FX node supplying the weight.
+        """
+        nncf_node = nncf_graph.get_node_by_name(target_node.name)
+        weights_ports_ids = nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
+            nncf_node, nncf_graph
+        )
+        if len(weights_ports_ids) > 1:
+            # TODO(dlyakhov): support quantization for nodes with several weights
+            nncf.common.logging.nncf_logger.warning(
+                f"Quantization of the weighted node {target_node.name}"
+                " is not yet supported by the OpenVINOQuantizer."
+                f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
+                f" Quantizable weights are located on ports: {weights_ports_ids}."
+            )
+        weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
+        return (weight_node, target_node)
+
     @staticmethod
     def _get_edge_or_node(
         target_node: torch.fx.Node,
@@ -252,22 +384,7 @@ def _get_edge_or_node(
         """
         ip = qp.insertion_point
         if qp.is_weight_quantization_point():
-            nncf_node = nncf_graph.get_node_by_name(target_node.name)
-            weights_ports_ids = (
-                nncf.torch.model_graph_manager.get_weight_tensor_port_ids(
-                    nncf_node, nncf_graph
-                )
-            )
-            if len(weights_ports_ids) > 1:
-                # TODO(dlyakhov): support quantization for nodes with several weights
-                nncf.common.logging.nncf_logger.warning(
-                    f"Quantization of the weighted node {target_node.name}"
-                    " is not yet supported by the OpenVINOQuantizer."
-                    f" Only the weight on port ID {weights_ports_ids[0]} will be quantized."
-                    f" Quantizable weights are located on ports: {weights_ports_ids}."
-                )
-            weight_node = target_node.all_input_nodes[weights_ports_ids[0]]
-            return (weight_node, target_node)
+            OpenVINOQuantizer._get_weight_edge(target_node, nncf_graph)
 
         if ip.input_port_id is None:
             return target_node
@@ -294,22 +411,78 @@ def _fill_torch_ao_annotation(
             annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec
 
     @staticmethod
-    def _get_torch_ao_qspec_from_qp(
+    def _get_torch_ao_qspec_from_nncf_config_for_wc(
+        wc_param: WeightCompressionParameters,
+    ) -> QuantizationSpec:
+        """
+        Returns a TorchAO QuantizationSpec based on NNCF weight compression parameter.
+
+        :param wc_param: NNCF Weight compression parameters for the node.
+        :return: A TorchAO QuantizationSpec.
+        """
+        observer: Type[UniformQuantizationObserverBase]
+
+        extra_args: Dict[str, Any] = {}
+
+        qmode = wc_param.compression_config.mode
+        extra_args["wc_param"] = wc_param
+        is_asym_mode = wc_param.compression_config.is_asym_mode
+        if qmode in [
+            nncf.CompressWeightsMode.INT4_ASYM,
+            nncf.CompressWeightsMode.INT4_SYM,
+        ]:
+            observer = INT4WeightObserver  # type: ignore[type-abstract]
+            quant_min = -8 if not is_asym_mode else 0
+            quant_max = 7 if not is_asym_mode else 15
+            dtype = torch.int8
+            channel_axis = 0
+            torch_qscheme = torch_qscheme = (
+                torch.per_channel_symmetric
+                if not is_asym_mode
+                else torch.per_channel_affine
+            )
+        else:
+            observer = INT8WeightObserver  # type: ignore[type-abstract]
+            quant_min = -128 if not is_asym_mode else 0
+            quant_max = 127 if not is_asym_mode else 255
+            dtype = torch.int8
+            channel_axis = 0
+            torch_qscheme = (
+                torch.per_channel_symmetric
+                if not is_asym_mode
+                else torch.per_channel_affine
+            )
+        return QuantizationSpec(
+            dtype=dtype,
+            observer_or_fake_quant_ctr=observer.with_args(**extra_args),
+            quant_min=quant_min,
+            quant_max=quant_max,
+            qscheme=torch_qscheme,
+            ch_axis=channel_axis,
+            is_dynamic=False,
+        )
+
+    @staticmethod
+    def _get_torch_ao_qspec_from_nncf_config_for_ptq(
         qp: quantization.quantizer_setup.QuantizationPointBase,
     ) -> QuantizationSpec:
         """
-        Retrieves the quantization configuration from the given quantization point and
-        converts it into a QuantizationSpec.
+        Returns a TorchAO QuantizationSpec based on NNCF quantization point.
 
-        :param qp: An instance of QuantizationPointBase.
-        :return: A QuantizationSpec retrieved and converted from the quantization point.
+        :param qp: Quantization point from NNCF.
+        :return: A TorchAO QuantizationSpec.
         """
+        observer: Type[UniformQuantizationObserverBase]
+
         # Eps value is copied from nncf/torch/quantization/layers.py
-        extra_args = {"eps": 1e-16}
-        qconfig = qp.qconfig
-        is_weight = qp.is_weight_quantization_point()
+        extra_args: Dict[str, Any] = {"eps": 1e-16}
 
-        observer: Type[UniformQuantizationObserverBase]
+        is_weight = qp.is_weight_quantization_point()
+        qconfig = qp.qconfig
+        dtype = torch.int8
+        quant_min = None
+        quant_max = None
+        channel_axis = None
 
         if qconfig.per_channel:
             torch_qscheme = (
@@ -329,6 +502,11 @@ def _get_torch_ao_qspec_from_qp(
             quant_max = 127
             dtype = torch.int8
             channel_axis = 0
+            torch_qscheme = (
+                torch.per_channel_symmetric
+                if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+                else torch.per_channel_affine
+            )
         else:
             observer = (
                 HistogramObserver
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 316633e9004..519818d0aac 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
 transformers
-git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@3d753ac#egg=nncf
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index 8ec40d7f7c6..bac006ce916 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -114,6 +114,26 @@ exr::Error OpenvinoBackend::execute(
         ov_type, input_shape, input_tensor.mutable_data_ptr());
 
     infer_request->set_input_tensor(i, ov_input_tensor);
+
+    if (args[i]->isInt()) {
+      int64_t* val = &(args[i]->payload.copyable_union.as_int);
+
+      // Create OpenVINO tensor from integer input
+      ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+      infer_request->set_input_tensor(i, ov_input_tensor);
+    } else {
+      auto input_tensor = args[i]->toTensor();
+      ov::Shape input_shape(
+          input_tensor.sizes().begin(), input_tensor.sizes().end());
+
+      // Convert input tensor to OpenVINO tensor
+      ov::element::Type ov_type =
+          convert_to_openvino_type(input_tensor.scalar_type());
+      ov::Tensor ov_input_tensor(
+          ov_type, input_shape, input_tensor.mutable_data_ptr());
+
+      infer_request->set_input_tensor(i, ov_input_tensor);
+    }
   }
 
   // Set outputs
@@ -165,10 +185,14 @@ ov::element::Type OpenvinoBackend::convert_to_openvino_type(
   switch (scalar_type) {
     case exa::ScalarType::Float:
       return ov::element::f32;
+    case exa::ScalarType::Half:
+      return ov::element::f16;
     case exa::ScalarType::Int:
       return ov::element::i32;
     case exa::ScalarType::Char:
       return ov::element::i8;
+    case exa::ScalarType::Byte:
+      return ov::element::u8;
     case exa::ScalarType::Long:
       return ov::element::i64;
     case exa::ScalarType::Bool:
diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh
index 5a26f0b6dae..6d7853b96e5 100755
--- a/backends/openvino/scripts/openvino_build.sh
+++ b/backends/openvino/scripts/openvino_build.sh
@@ -7,55 +7,106 @@ set -e
 EXECUTORCH_ROOT=$(realpath "$(dirname "$0")/../../..")
 echo EXECUTORCH_ROOT=${EXECUTORCH_ROOT}
 
-main() {
-    build_type=${1:-"--cpp_runtime"}
+install_requirements() {
+    echo "Installing Requirements For OpenVINO Backend"
+    cd "$EXECUTORCH_ROOT"
+    pip install -r backends/openvino/requirements.txt
+}
 
-    # If the first arguments is --cpp_runtime (default), build libraries for C++ runtime
-    if [[ -z "$build_type" || "$build_type" == "--cpp_runtime" ]]; then
-        echo "Building C++ Runtime Libraries"
+build_cpp_runtime() {
+    echo "Building C++ Runtime Libraries"
+
+    # Set build directory
+    local build_dir="cmake-out"
+
+    # Enter the Executorch root directory
+    cd "$EXECUTORCH_ROOT"
+    rm -rf "${build_dir}"
+
+    # Configure the project with CMake
+    # Note: Add any additional configuration options you need here
+    cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_OPENVINO=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+          -B"${build_dir}"
+
+
+    # Build the project
+    cmake --build ${build_dir} --target install --config Release -j$(nproc)
+}
 
-        # Set build directory
-        local build_dir="cmake-out"
+build_llama_runner() {
+    echo "Building Export Llama Runner"
 
-        # Create and enter the build directory
-        cd "$EXECUTORCH_ROOT"
-        rm -rf "${build_dir}"
+    # Set build directory
+    local build_dir="cmake-out"
 
-        # Configure the project with CMake
-        # Note: Add any additional configuration options you need here
-        cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
-              -DCMAKE_BUILD_TYPE=Release \
-              -DEXECUTORCH_BUILD_OPENVINO=ON \
-              -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-              -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-              -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \
-              -B"${build_dir}"
+    # Enter the Executorch root directory
+    cd "$EXECUTORCH_ROOT"
 
+    # Configure the project with CMake
+    # Note: Add any additional configuration options you need here
+    cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \
+        -DCMAKE_BUILD_TYPE=Release \
+        -B"${build_dir}"/examples/models/llama \
+        examples/models/llama
+    # Build the export llama runner
+    cmake --build cmake-out/examples/models/llama -j$(nproc) --config Release
+}
 
-        # Build the project
-        cmake --build ${build_dir} --target install --config Release -j$(nproc)
+build_python_enabled() {
+    echo "Building Python Package with Pybinding"
 
-    # If the first arguments is --enable_python, build python package with python bindings
-    elif [[ "$build_type" == "--enable_python" ]]; then
-        echo "Building Python Package with Pybinding"
+    # Enter the Executorch root directory
+    cd "$EXECUTORCH_ROOT"
+    ./install_executorch.sh --clean
+
+    # Set parameters to configure the project with CMake
+    # Note: Add any additional configuration options you need here
+    export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
+                       -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
+    export CMAKE_BUILD_ARGS="--target openvino_backend"
 
-        # Create and enter the build directory
-        cd "$EXECUTORCH_ROOT"
-        ./install_executorch.sh --clean
+    # Build the package
+    ./install_executorch.sh --minimal
 
-        # Set parameters to configure the project with CMake
-        # Note: Add any additional configuration options you need here
-        export CMAKE_ARGS="-DEXECUTORCH_BUILD_OPENVINO=ON \
-                           -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON"
-        export CMAKE_BUILD_ARGS="--target openvino_backend"
+    # Install torchao
+    pip install third-party/ao
+}
+
+main() {
+    build_type=${1:-"--build_all"}
+
+    # If the first arguments is --build_all (default), build python package, C++ runtime, and llama runner binary
+    if [[ -z "$build_type" || "$build_type" == "--build_all" ]]; then
+        install_requirements
+        build_python_enabled
+        build_cpp_runtime
+        build_llama_runner
 
-        # Build the package
-        ./install_executorch.sh --minimal
+    # If the first arguments is --cpp_runtime, build libraries for C++ runtime
+    elif [[ "$build_type" == "--cpp_runtime" ]]; then
+        build_cpp_runtime
 
-        # Install torchao
-        pip install third-party/ao
+    # If the first arguments is --llama_runner, build export llama runner binary
+    # Note: c++ runtime with openvino backend should be built before building export llama runner
+    elif [[ "$build_type" == "--llama_runner" ]]; then
+        build_llama_runner
+
+    # If the first arguments is --enable_python, build python package with python bindings
+    elif [[ "$build_type" == "--enable_python" ]]; then
+        install_requirements
+        build_python_enabled
 
     else
         echo "Error: Argument is not valid: $build_type"
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index 32105597260..07166b92ea2 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -214,7 +214,7 @@ add_subdirectory(
 install(
   TARGETS qnn_executorch_backend
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 
 # QNN pybind
diff --git a/backends/qualcomm/__init__.py b/backends/qualcomm/__init__.py
index 04ba5fcf24b..5770dfb0fcd 100644
--- a/backends/qualcomm/__init__.py
+++ b/backends/qualcomm/__init__.py
@@ -1,23 +1,13 @@
 import os
 
-from .scripts.download_qnn_sdk import (
-    check_glibc_exist_and_validate,
-    install_qnn_sdk,
-    is_linux_x86,
-)
+from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86
 
 
 env_flag = os.getenv("EXECUTORCH_BUILDING_WHEEL", "0").lower()
 # If users have preinstalled QNN_SDK_ROOT, we will use it.
 qnn_sdk_root_flag = os.getenv("QNN_SDK_ROOT", None)
 
-if (
-    env_flag not in ("1", "true", "yes")
-    and not qnn_sdk_root_flag
-    and is_linux_x86()
-    and check_glibc_exist_and_validate()
-):
+if env_flag not in ("1", "true", "yes") and not qnn_sdk_root_flag and is_linux_x86():
     ok = install_qnn_sdk()
-
     if not ok:
         raise RuntimeError("Failed to install QNN SDK. Please check the logs above.")
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index 5d0ac832237..154a360689e 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -18,11 +18,13 @@
 from .decompose_col_im import DecomposeColIm
 from .decompose_einsum import DecomposeEinsum
 from .decompose_expm1 import DecomposeExpM1
+from .decompose_floor_divide import DecomposeFloorDivide
 from .decompose_glu import DecomposeGlu
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_minmaxdim import DecomposeMinMaxDim
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
+from .decompose_threshold import DecomposeThreshold
 from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
@@ -32,6 +34,7 @@
 from .i64_to_i32 import I64toI32
 from .insert_io_qdq import InsertIOQDQ
 from .insert_requantize import InsertRequantize
+from .insert_reshape_for_reduce_ops import InsertReshapeForReduceOps
 from .layout_transform import LayoutTransform
 from .lift_constant_scalar_operands import LiftConstantScalarOperands
 from .recompose_pixel_unshuffle import RecomposePixelUnshuffle
@@ -44,7 +47,6 @@
 from .seq_mse import SeqMSE
 from .tag_quant_io import TagQuantIO
 
-
 __all__ = [
     AnnotateAdaptiveAvgPool1D,
     AnnotateQuantAttrs,
@@ -60,11 +62,13 @@
     DecomposeColIm,
     DecomposeEinsum,
     DecomposeExpM1,
+    DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
     DecomposeMinMaxDim,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeThreshold,
     DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
@@ -73,6 +77,7 @@
     FuseConsecutiveTranspose,
     I64toI32,
     InsertIOQDQ,
+    InsertReshapeForReduceOps,
     InsertRequantize,
     LayoutTransform,
     LiftConstantScalarOperands,
diff --git a/backends/qualcomm/_passes/decompose_floor_divide.py b/backends/qualcomm/_passes/decompose_floor_divide.py
new file mode 100644
index 00000000000..f7de074259e
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_floor_divide.py
@@ -0,0 +1,62 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import merge_decomposed_graph
+
+
+class FloorDivide(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        dtype = x.dtype
+        result = torch.div(x, y)
+        result = torch.floor(result)
+        return result.to(dtype)
+
+
+class DecomposeFloorDivide(ExportPass):
+    """
+    Decompose for math equivalent op.
+    Since QNN does not support floor_divide operations for int32 or int64 inputs,
+    it is necessary to decompose the operation into a division using floating-point precision,
+    followed by applying the floor function.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            model = FloorDivide()
+            if (
+                torch.ops.aten.floor_divide.default == node.target
+                and not torch.is_floating_point(node.meta["val"])
+            ):
+                decomposed_module = torch.export.export(
+                    model,
+                    (node.args[0].meta["val"], node.args[1].meta["val"]),
+                    strict=True,
+                ).module()
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that reference to nodes are correctly updated in the new graph
+                    remap = {"x": node.args[0], "y": node.args[1]}
+                    merge_decomposed_graph(
+                        remap=remap,
+                        target_node=node,
+                        target_graph=graph,
+                        decomposed_graph_module=decomposed_module,
+                    )
+                    graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/decompose_threshold.py b/backends/qualcomm/_passes/decompose_threshold.py
new file mode 100644
index 00000000000..0f0a1bc4ea8
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_threshold.py
@@ -0,0 +1,61 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import merge_decomposed_graph
+
+
+class DecomposeModule(torch.nn.Module):
+    def __init__(self, threshold, value):
+        super().__init__()
+        self.threshold = threshold
+        self.value = value
+
+    def forward(self, x):
+        return torch.where(x <= self.threshold, self.value, x)
+
+
+class DecomposeThreshold(ExportPass):
+    """
+    Decompose threshold to less_equal and where.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target in {
+                torch.ops.aten.threshold_.default,
+                torch.ops.aten.threshold.default,
+            }:
+                input_node = node.args[0]
+                threshold = node.args[1]
+                value = node.args[2]
+
+                model = DecomposeModule(threshold, value)
+                decomposed_module = torch.export.export(
+                    model, (input_node.meta["val"],), strict=True
+                ).module()
+
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that reference to nodes are correctly updated in the new graph
+                    remap = {"x": input_node}
+                    merge_decomposed_graph(
+                        remap=remap,
+                        target_node=node,
+                        target_graph=graph,
+                        decomposed_graph_module=decomposed_module,
+                    )
+                    graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py
new file mode 100644
index 00000000000..52f9546c28e
--- /dev/null
+++ b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py
@@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+
+
+class InsertReshapeForReduceOps(ExportPass):
+    """
+    Rewrite `aten.argmax.default` with `dim=None` into
+    a reshape-to-1D followed by argmax(dim=0).
+
+    PyTorch semantics:
+      torch.argmax(x, dim=None) -> flatten(x) then argmax along axis=0
+
+    QNN requires an explicit axis, so we insert the reshape.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.op_map = {torch.ops.aten.argmax.default, torch.ops.aten.argmin.default}
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        modified = False
+
+        for n in graph.nodes:
+            if n.target in self.op_map:
+                dim_arg = None if len(n.args) == 1 else n.args[1]
+
+                if dim_arg is None:
+                    inp = n.args[0]
+
+                    # Insert reshape before argmax
+                    with graph.inserting_before(n):
+                        reshape_node = graph.create_node(
+                            "call_function",
+                            torch.ops.aten.reshape.default,
+                            (inp, [-1]),
+                            {},
+                        )
+                        reshape_node.meta = dict(inp.meta)
+                        if "val" in inp.meta:
+                            reshape_node.meta["val"] = inp.meta["val"].reshape(-1)
+
+                    # Rewrite argmax: take reshape_node as input, set dim=0
+                    n.args = (reshape_node, 0, *n.args[2:])
+
+                modified = True
+
+        if modified:
+            graph_module.recompile()
+            dead_code_elimination_pass(graph_module)
+
+        return PassResult(graph_module, modified)
diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py
index f5c5915cab2..52bdf7fa090 100644
--- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py
+++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py
@@ -51,6 +51,7 @@ class TensorOpInfo:
     # The scalar number arg[1] is missing when using default. Result in a corner case to deal
     aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True, False),
     aten.leaky_relu_.default: TensorOpInfo(aten.prelu.default, True, False),
+    aten.where.ScalarSelf: TensorOpInfo(aten.where.self, False, True),
     aten.where.ScalarOther: TensorOpInfo(aten.where.self, False, True),
     aten.where.Scalar: TensorOpInfo(aten.where.self, False, True),
     aten.masked_fill.Scalar: TensorOpInfo(aten.masked_fill.Tensor, False, False),
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
index 6e1369326fa..360581a2929 100644
--- a/backends/qualcomm/_passes/qnn_pass_manager.py
+++ b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -23,11 +23,13 @@
     DecomposeColIm,
     DecomposeEinsum,
     DecomposeExpM1,
+    DecomposeFloorDivide,
     DecomposeGlu,
     DecomposeLinalgVectorNorm,
     DecomposeMinMaxDim,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeThreshold,
     DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
@@ -37,6 +39,7 @@
     I64toI32,
     InsertIOQDQ,
     InsertRequantize,
+    InsertReshapeForReduceOps,
     LayoutTransform,
     LiftConstantScalarOperands,
     RecomposePixelUnshuffle,
@@ -200,6 +203,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
+        self.add_pass(DecomposeThreshold())
         self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
@@ -207,6 +211,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(ReplaceInfValues())
         self.add_pass(LiftConstantScalarOperands())
+        self.add_pass(InsertReshapeForReduceOps())
         return self._transform(graph_module)
 
     def transform_for_export_pipeline(
@@ -216,8 +221,14 @@ def transform_for_export_pipeline(
         self.add_pass(DecomposeCDist())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
+        self.add_pass(DecomposeThreshold())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        # DecomposeFloorDivide does not apply to the annotation pipeline,
+        # since the CPU QDQ model would reduce accuracy.
+        # We keep div and floor operations in floating-point to maintain precision.
+        # This pass is needed before to_edge pipeline to avoid mixed type for div operator with RemoveMixedTypeOperators pass.
+        self.add_pass(DecomposeFloorDivide())
         self.add_pass(DecomposeWrapWithAutocast())
         # this pass will rewrite state_dict, it needs to be accomplished before
         # to_edge_transform_and_lower
@@ -226,6 +237,7 @@ def transform_for_export_pipeline(
             self.add_pass(ConvertLinearToConv2d(exported_program))
         self.add_pass(ConvertSquareToPow())
         self.add_pass(LiftConstantScalarOperands())
+        self.add_pass(InsertReshapeForReduceOps())
         self._transform(exported_program.graph_module)
         ep = lift_constant_tensor_pass(exported_program)
         return ep
diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
index bc2b62c8c0b..8cbf3a50e22 100644
--- a/backends/qualcomm/builders/node_visitor.py
+++ b/backends/qualcomm/builders/node_visitor.py
@@ -176,7 +176,7 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict):
         user_0 = self.get_first_user(node)
         if "convolution" in user_0.target.__name__:
             # OIHW (pytorch) -> HWIO (QNN)
-            quant_config[QCOM_AXIS] = 3
+            quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1
             quant_config[QCOM_AXIS_ORDER] = (2, 3, 1, 0)
         elif "linear" in user_0.target.__name__:
             # OI (pytorch) -> OI (QNN)
@@ -218,7 +218,7 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict):
         user_0 = self.get_first_user(node)
         # Memory layout of QNN conv weight always ends in Output. Like conv2d is HWIO
         if "convolution" in user_0.target.__name__:
-            quant_config[QCOM_AXIS] = 3
+            quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1
         else:
             quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS]
 
diff --git a/backends/qualcomm/builders/op_index_put.py b/backends/qualcomm/builders/op_index_put.py
index c3c42ed483a..23481894f0d 100644
--- a/backends/qualcomm/builders/op_index_put.py
+++ b/backends/qualcomm/builders/op_index_put.py
@@ -1,14 +1,19 @@
 import warnings
+from collections import OrderedDict
 from typing import Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
 import numpy as np
 import torch
 
-from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS
+from executorch.backends.qualcomm.utils.constants import (
+    QCOM_DATA,
+    QCOM_DTYPE,
+    QCOM_QUANT_ATTRS,
+)
 from executorch.exir.dialects._ops import ops as exir_ops
 
-from .node_visitor import NodeVisitor, QNN_TENSOR_TYPE_MAP
+from .node_visitor import NodeVisitor, QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP
 from .node_visitor_manager import register_node_visitor
 from .qnn_constants import (
     OpConcat,
@@ -26,7 +31,7 @@ class IndexPutVisitor(NodeVisitor):
     def __init__(self, *args) -> None:
         super().__init__(*args)
 
-    def define_node(
+    def define_node(  # noqa: C901
         self,
         node: torch.fx.Node,
         nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
@@ -37,6 +42,7 @@ def define_node(
         if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS):
             quant_attrs = quant_attrs.copy()
             input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs
+
         input_tensor = self.get_tensor(input_node, node)
         input_tensor_wrapper = self.define_tensor(
             input_node,
@@ -46,52 +52,110 @@ def define_node(
             nodes_to_wrappers,
         )
 
-        indicies_node = node.args[1]
-        index_node_dim = None
-        index_nodes = []
-        index_tensors = []
+        indices_nodes = (
+            node.args[1] if isinstance(node.args[1], list) else [node.args[1]]
+        )
         target_index = []
+        all_range_index = OrderedDict()
+        index_dtype = [
+            node.meta["val"].dtype for node in indices_nodes if node is not None
+        ][0]
+
+        # preprocess:
+        # - broadcast dimension for multiple specified index
+        # - broadcast specified index if dimensions are not matched
+        max_indices_in_specified_index = 0
+        for index, idx_node in enumerate(indices_nodes):
+            if isinstance(idx_node, torch.fx.Node):
+                last_specified_index_node = index
+                if max_indices_in_specified_index < idx_node.meta["val"].nelement():
+                    max_indices_in_specified_index = idx_node.meta["val"].nelement()
         # If there is None in a list, it means all range at that dimension
-        # E.g., indicies_node: [None, None, aten__to_copy_default_1]
-        if isinstance(indicies_node, list):
-            for index, idx_node in enumerate(indicies_node):
-                # First, collect the indice_node and index of None to construct the shape of index node
-                # E.g., shape of input: [1, 1024, 12, 64]
-                # For "None" axis (assume indicies_node: [None, None, aten__to_copy_default_1]),
-                # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2
-                if isinstance(idx_node, torch.fx.Node):
-                    index_nodes.append(idx_node)
-                    index_tensors.append(self.get_tensor(idx_node, idx_node))
-                    target_index.extend(index_tensors[-1].size())
-                    index_node_dim = index
-                elif idx_node is None and index_node_dim is None:
-                    # E.g., indicies_node: [None, aten__to_copy_default_1, None]
-                    # Don't need to consider "None" after index_node.
-                    target_index.append(input_tensor.size(index))
-                else:
-                    warnings.warn(
-                        f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None",
-                        stacklevel=1,
+        for index, idx_node in enumerate(indices_nodes):
+            # First, collect the index_node and index of None to construct the shape of index node
+            # E.g., shape of input: [1, 1024, 12, 64]
+            # For "None" axis (assume indices_node: [None, None, aten__to_copy_default_1]),
+            # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2
+            if isinstance(idx_node, torch.fx.Node):
+                # e.g. for case [index_node_0, None, index_node_1], nodes will have the same number of indices
+                target_index.append(
+                    self.get_tensor(idx_node, idx_node).nelement()
+                    if last_specified_index_node == index
+                    else 1
+                )
+            elif idx_node is None:
+                # E.g., indices_node: [None, None, aten__to_copy_default_1]
+                all_range_index[index] = torch.arange(
+                    input_tensor.size(index), dtype=index_dtype
+                )
+                target_index.append(input_tensor.size(index))
+            else:
+                warnings.warn(
+                    f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None",
+                    stacklevel=1,
+                )
+                return
+
+        # preprocess all range indices if any
+        if None in indices_nodes:
+            all_range_tensor = torch.cartesian_prod(*all_range_index.values())
+            # repeat all_range_tensor interleavely for future concatenation
+            # e.g. input_node = [5, 4, 3, 2], indices = [index_0_node, None, index_2_node]
+            #      index_0.shape == index_2.shape == 2 (will guarantee this condition)
+            #      where user specified (3, 4) for index_0, (0, 1) for index_2
+            # ---
+            # we should have all_range_tensor: [0, 1, 2, 3]
+            # repeat interleavely with 2 to match future tiled index_0_node & index_2_node
+            # we'll have 1(index_0 -> same as index_2)*4(index_1)*2(index_2) indices in total:
+            # | index_0_node | None | index_2_node |
+            # | 3            | 0    | 0            |
+            # | 4            | 0    | 1            |
+            # | 3            | 1    | 0            |
+            # | 4            | 1    | 1            |
+            # | 3            | 2    | 0            |
+            # | 4            | 2    | 1            |
+            # | 3            | 3    | 0            |
+            # | 4            | 3    | 1            |
+            all_range_tensor_aug = all_range_tensor.repeat_interleave(
+                max_indices_in_specified_index, dim=0
+            )
+            for index in all_range_index.keys():
+                # Repeat index for "None" axis in indices_nodes
+                range_index_node = torch.fx.Node(
+                    node.graph,
+                    node.name + f"_all_range_index_{index}",
+                    "call_function",
+                    exir_ops.edge.aten.tensor.default,
+                    (),  # args
+                    {},  # kwargs
+                )
+                range_indices = (
+                    (
+                        all_range_tensor_aug[:, index]
+                        if all_range_tensor_aug.dim() > 1
+                        else
+                        # if there is only one None
+                        all_range_tensor_aug
                     )
-                    return
-        # Assume that there is only one node in list
-        assert len(index_nodes) == 1, "Not support multiple indices tensor"
-        indice_node = index_nodes[0]
-        indice_tensor = index_tensors[0]
-        indices_tensor_wrapper = self.define_tensor(
-            indice_node,
-            node,
-            indice_tensor,
-            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
-            nodes_to_wrappers,
-        )
+                    .reshape(-1, 1)
+                    .contiguous()
+                )
+                target_index_tensor_wrapper = self.define_tensor(
+                    range_index_node,
+                    node,
+                    range_indices,
+                    PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+                    nodes_to_wrappers,
+                )
+                # store it for future concatenation
+                all_range_index[index] = (range_indices, target_index_tensor_wrapper)
 
         # Need to reconstruct the index tensor.
         # E.g., based on ScatterND Op Def in QNN Docs.
         # Torch:
         #   Given that
         #     shape of input: [1, 12, 1024, 64]
-        #     indicies_node: [None, None, aten__to_copy_default_1]
+        #     indices_node: [None, None, aten__to_copy_default_1]
         #     shape of aten__to_copy_default_1: [1]
         # QNN:
         #   Index tensor:
@@ -104,113 +168,135 @@ def define_node(
         #   update_indices = indices.shape[:-1]
         #   for idx in np.ndindex(update_indices):
         #       output[indices[idx]] = updates[idx]
+        specified_index = OrderedDict()
+        for i, indices_node in enumerate(indices_nodes):
+            if indices_node is None:
+                continue
 
-        # Append one dimension to specify x-tuple
-        index_shape = target_index + [1]
-        # Reshape the index_node for tile op
-        reshape_shape = [
-            shape if id == index_node_dim else 1 for id, shape in enumerate(index_shape)
-        ]
-        reshape_output_tensor = indice_tensor.reshape(reshape_shape)
-        reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper(
-            node_name=node.name + "_reshape",
-            tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
-            dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype],
-            quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED,
-            quant_configs={},
-            dims=reshape_output_tensor.size(),
-            tensor=reshape_output_tensor,
-            is_fake_tensor=True,
-            nodes_to_wrappers=nodes_to_wrappers,
-        )
-        reshape_op = PyQnnWrapper.PyQnnOpWrapper(
-            node.name,
-            QNN_OP_PACKAGE_NAME_QTI_AISW,
-            OpReshape.op_name,
-        )
-        reshape_op.AddInputTensors([indices_tensor_wrapper])
-        reshape_op.AddOutputTensors([reshape_output_tensor_wrapper])
-        op_wrapper_list.append(reshape_op)
-        index_put_index_input_tensor_wrapper = reshape_output_tensor_wrapper
+            indices_tensor = self.get_tensor(indices_node, indices_node)
+            indices_tensor_wrapper = self.define_tensor(
+                indices_node,
+                node,
+                indices_tensor,
+                PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+                nodes_to_wrappers,
+            )
+            if indices_tensor.nelement() < max_indices_in_specified_index:
+                # broadcast the specified index
+                indices_tensor = indices_tensor.repeat(max_indices_in_specified_index)
+                indices_multiples = [max_indices_in_specified_index]
+                indices_multiples_shape = [len(indices_multiples)]
+                indices_tile_tensor_wrapper = self.define_custom_tensor_wrapper(
+                    node_name=node.name + f"_indices_tile_{i}",
+                    tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+                    dtype=QNN_TENSOR_TYPE_MAP[indices_tensor.dtype],
+                    quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                    quant_configs={},
+                    dims=indices_tensor.size(),
+                    tensor=indices_tensor,
+                    is_fake_tensor=True,
+                    nodes_to_wrappers=nodes_to_wrappers,
+                )
+                tile_op = PyQnnWrapper.PyQnnOpWrapper(
+                    node.name,
+                    QNN_OP_PACKAGE_NAME_QTI_AISW,
+                    OpTile.op_name,
+                )
+                tile_op.AddInputTensors([indices_tensor_wrapper])
+                tile_op.AddOutputTensors([indices_tile_tensor_wrapper])
+                tile_op.AddTensorParam(
+                    OpTile.param_multiples,
+                    PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+                    len(indices_multiples_shape),
+                    indices_multiples_shape,
+                    np.array(indices_multiples, dtype=np.uint32),
+                    True,
+                )
+                op_wrapper_list.append(tile_op)
+                indices_tensor_wrapper = indices_tile_tensor_wrapper
 
-        # Tile the index_node and concat the target index
-        if None in indicies_node:
-            tile_output_tensor = reshape_output_tensor.expand(index_shape)
-            # Tile the index_node to align with the shape of target_index
-            # Only need to tile the dim of None axis
-            # E.g., indicies_node: [None, None, aten__to_copy_default_1]
-            # Should tile the first two dimension.
-            multiples = [
-                shape if id != index_node_dim else 1
-                for id, shape in enumerate(index_shape)
-            ]
-            multiples_shape = [len(index_shape)]
-            tile_output_tensor_wrapper = self.define_custom_tensor_wrapper(
-                node_name=node.name + "_tile",
+            # Append one dimension to specify x-tuple
+            # Reshape the index_node for tile op
+            reshape_shape = list(indices_tensor.shape) + [1]
+            reshape_output_tensor = indices_tensor.reshape(reshape_shape)
+            reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper(
+                node_name=node.name + f"_reshape_{i}",
                 tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
-                dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype],
+                dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype],
                 quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED,
                 quant_configs={},
-                dims=tile_output_tensor.size(),
-                tensor=tile_output_tensor,
+                dims=reshape_output_tensor.size(),
+                tensor=reshape_output_tensor,
                 is_fake_tensor=True,
                 nodes_to_wrappers=nodes_to_wrappers,
             )
-            tile_op = PyQnnWrapper.PyQnnOpWrapper(
+            reshape_op = PyQnnWrapper.PyQnnOpWrapper(
                 node.name,
                 QNN_OP_PACKAGE_NAME_QTI_AISW,
-                OpTile.op_name,
+                OpReshape.op_name,
             )
-            tile_op.AddInputTensors([reshape_output_tensor_wrapper])
-            tile_op.AddOutputTensors([tile_output_tensor_wrapper])
-            tile_op.AddTensorParam(
-                OpTile.param_multiples,
-                PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
-                len(multiples_shape),
-                multiples_shape,
-                np.array(multiples, dtype=np.uint32),
-                True,
-            )
-            op_wrapper_list.append(tile_op)
+            reshape_op.AddInputTensors([indices_tensor_wrapper])
+            reshape_op.AddOutputTensors([reshape_output_tensor_wrapper])
+            op_wrapper_list.append(reshape_op)
+            index_tensor_wrapper = reshape_output_tensor_wrapper
+            index_tensor = reshape_output_tensor
 
-            # Repeat index for "None" axis in indicies_node
-            ranges = [
-                torch.arange(dim, dtype=indice_tensor.dtype)
-                for dim in target_index[:-1]
-            ]
-            target_index_shape = target_index + [len(ranges)]
-            target_index_tensor = torch.cartesian_prod(*ranges)
-            reshape_target_index_shape = [
-                shape if id != index_node_dim else 1
-                for id, shape in enumerate(target_index_shape)
-            ]
-            target_index_tensor = target_index_tensor.reshape(
-                reshape_target_index_shape
-            )
-            target_index_tensor = target_index_tensor.expand(
-                target_index_shape
-            ).contiguous()
-            target_index_node = torch.fx.Node(
-                node.graph,
-                node.name + "_target_index",
-                "call_function",
-                exir_ops.edge.aten.tensor.default,
-                (),  # args
-                {},  # kwargs
-            )
-            target_index_tensor_wrapper = self.define_tensor(
-                target_index_node,
-                node,
-                target_index_tensor,
-                PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
-                nodes_to_wrappers,
-            )
+            # Tile the index_node and concat the target index
+            if None in indices_nodes:
+                tile_output_tensor = reshape_output_tensor.repeat(
+                    all_range_tensor.size(0), 1
+                )
+                # Tile the index_node to align with the shape of target_index
+                # Only need to tile the dim of None axis
+                # E.g., indices_node: [None, None, aten__to_copy_default_1]
+                # Should tile the number of indices combination of first two dimension
+                # times number of indices specified by aten__to_copy_default_1
+                multiples = [all_range_tensor.size(0), 1]
+                multiples_shape = [len(multiples)]
+                tile_output_tensor_wrapper = self.define_custom_tensor_wrapper(
+                    node_name=node.name + f"_tile_{i}",
+                    tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+                    dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype],
+                    quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                    quant_configs={},
+                    dims=tile_output_tensor.size(),
+                    tensor=tile_output_tensor,
+                    is_fake_tensor=True,
+                    nodes_to_wrappers=nodes_to_wrappers,
+                )
+                tile_op = PyQnnWrapper.PyQnnOpWrapper(
+                    node.name,
+                    QNN_OP_PACKAGE_NAME_QTI_AISW,
+                    OpTile.op_name,
+                )
+                tile_op.AddInputTensors([reshape_output_tensor_wrapper])
+                tile_op.AddOutputTensors([tile_output_tensor_wrapper])
+                tile_op.AddTensorParam(
+                    OpTile.param_multiples,
+                    PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+                    len(multiples_shape),
+                    multiples_shape,
+                    np.array(multiples, dtype=np.uint32),
+                    True,
+                )
+                op_wrapper_list.append(tile_op)
+                index_tensor_wrapper = tile_output_tensor_wrapper
+                index_tensor = tile_output_tensor
 
-            # Concat target_index and tile output to reconstruct index_node
-            # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype
-            concat_output_tensor = torch.concat(
-                (target_index_tensor, tile_output_tensor), dim=-1
+            specified_index[i] = (index_tensor, index_tensor_wrapper)
+
+        # Concat target_index and tile output to reconstruct index_node
+        # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype
+        index_tensors, index_tensor_wrappers = [], []
+        for i, arg in enumerate(indices_nodes):
+            tensor, tensor_wrapper = (
+                all_range_index[i] if arg is None else specified_index[i]
             )
+            index_tensors.append(tensor)
+            index_tensor_wrappers.append(tensor_wrapper)
+
+        if len(index_tensor_wrappers) > 1:
+            concat_output_tensor = torch.concat(index_tensors, dim=-1)
             concat_output_tensor_wrapper = self.define_custom_tensor_wrapper(
                 node_name=node.name + "_concat",
                 tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
@@ -227,9 +313,7 @@ def define_node(
                 QNN_OP_PACKAGE_NAME_QTI_AISW,
                 OpConcat.op_name,
             )
-            concat_op.AddInputTensors(
-                [target_index_tensor_wrapper, tile_output_tensor_wrapper]
-            )
+            concat_op.AddInputTensors(index_tensor_wrappers)
             concat_op.AddOutputTensors([concat_output_tensor_wrapper])
             concat_op.AddScalarParam(
                 OpConcat.param_axis,
@@ -237,7 +321,6 @@ def define_node(
                 {QCOM_DATA: np.uint32(concat_output_tensor.dim() - 1)},
             )
             op_wrapper_list.append(concat_op)
-            index_put_index_input_tensor_wrapper = concat_output_tensor_wrapper
 
         value_node = self.get_node(node.args[2])
         value_tensor = self.get_tensor(value_node, node)
@@ -248,6 +331,94 @@ def define_node(
             PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
             nodes_to_wrappers,
         )
+        # handle broadcast scenario
+        # e.g. input_tensor: (1, 12, 1024, 64), value_tensor: (1, 64)
+        #      => value_reshape_tensor: (1, 1, 1, 64)
+        new_value_shape = (
+            *([1] * (input_tensor.dim() - value_tensor.dim())),
+            *value_tensor.shape,
+        )
+        # reshape the value_node for tile op
+        value_quant_encoding, value_quant_configs = self.get_quant_encoding_conf(
+            value_node, node
+        )
+        value_dtype = (
+            QNN_TENSOR_TYPE_MAP[value_tensor.dtype]
+            if value_quant_encoding
+            == PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED
+            else QNN_QUANT_TYPE_MAP[
+                (
+                    torch.uint16
+                    if value_quant_configs[QCOM_DTYPE] == torch.int32
+                    else value_quant_configs[QCOM_DTYPE]
+                )
+            ]
+        )
+        value_reshape_tensor = value_tensor.reshape(new_value_shape)
+        value_reshape_tensor_wrapper = self.define_custom_tensor_wrapper(
+            node_name=node.name + "_value_reshape",
+            tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            dtype=value_dtype,
+            quant_encoding=value_quant_encoding,
+            quant_configs=value_quant_configs,
+            dims=value_reshape_tensor.size(),
+            tensor=value_reshape_tensor,
+            is_fake_tensor=True,
+            nodes_to_wrappers=nodes_to_wrappers,
+        )
+        value_reshape_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpReshape.op_name,
+        )
+        value_reshape_op.AddInputTensors([value_tensor_wrapper])
+        value_reshape_op.AddOutputTensors([value_reshape_tensor_wrapper])
+        op_wrapper_list.append(value_reshape_op)
+
+        # e.g. input_tensor: (1, 12, 1024, 64), index_tensor: (None, None, 2), value_tensor: (1, 64)
+        #      => multiples: [1, 12, 2, 1]
+        value_multiples = []
+        for i in range(input_tensor.dim() - 1, -1, -1):
+            if i in specified_index:
+                # all user specified index node wil have the same dimension
+                multiplier = (
+                    indices_nodes[i].meta["val"].nelement() // new_value_shape[i]
+                    if i == last_specified_index_node
+                    else 1
+                )
+            else:
+                multiplier = input_tensor.shape[i] // new_value_shape[i]
+            value_multiples.insert(0, multiplier)
+
+        value_tile_tensor = value_reshape_tensor.repeat(value_multiples)
+        value_multiples_shape = [len(value_multiples)]
+        value_tile_tensor_wrapper = self.define_custom_tensor_wrapper(
+            node_name=node.name + "_value_tile",
+            tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            dtype=value_dtype,
+            quant_encoding=value_quant_encoding,
+            quant_configs=value_quant_configs,
+            dims=value_tile_tensor.size(),
+            tensor=value_tile_tensor,
+            is_fake_tensor=True,
+            nodes_to_wrappers=nodes_to_wrappers,
+        )
+        value_tile_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpTile.op_name,
+        )
+        value_tile_op.AddInputTensors([value_reshape_tensor_wrapper])
+        value_tile_op.AddOutputTensors([value_tile_tensor_wrapper])
+        value_tile_op.AddTensorParam(
+            OpTile.param_multiples,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(value_multiples_shape),
+            value_multiples_shape,
+            np.array(value_multiples, dtype=np.uint32),
+            True,
+        )
+        op_wrapper_list.append(value_tile_op)
 
         output_tensor = self.get_tensor(node, node)
         output_tensor_wrapper = self.define_tensor(
@@ -263,11 +434,46 @@ def define_node(
             QNN_OP_PACKAGE_NAME_QTI_AISW,
             OpScatterNd.op_name,
         )
+        # accumulation
+        if len(node.args) > 3 and node.args[3]:
+            index_put_op.AddScalarParam(
+                OpScatterNd.param_reduction,
+                PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+                {QCOM_DATA: 1},
+            )
+
+        # check final index_input tensor
+        index_input_tensor, index_input_tensor_wrapper = (
+            (concat_output_tensor, concat_output_tensor_wrapper)
+            if len(index_tensor_wrappers) > 1
+            else specified_index[last_specified_index_node]
+        )
+        target_index_reshape_tensor = index_input_tensor.reshape((*target_index, -1))
+        target_index_reshape_tensor_wrapper = self.define_custom_tensor_wrapper(
+            node_name=node.name + "_target_index_reshape",
+            tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            dtype=QNN_TENSOR_TYPE_MAP[target_index_reshape_tensor.dtype],
+            quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED,
+            quant_configs={},
+            dims=target_index_reshape_tensor.size(),
+            tensor=target_index_reshape_tensor,
+            is_fake_tensor=True,
+            nodes_to_wrappers=nodes_to_wrappers,
+        )
+        target_index_reshape_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpReshape.op_name,
+        )
+        target_index_reshape_op.AddInputTensors([index_input_tensor_wrapper])
+        target_index_reshape_op.AddOutputTensors([target_index_reshape_tensor_wrapper])
+        op_wrapper_list.append(target_index_reshape_op)
+
         index_put_op.AddInputTensors(
             [
                 input_tensor_wrapper,
-                index_put_index_input_tensor_wrapper,
-                value_tensor_wrapper,
+                target_index_reshape_tensor_wrapper,
+                value_tile_tensor_wrapper,
             ]
         )
         index_put_op.AddOutputTensors([output_tensor_wrapper])
diff --git a/backends/qualcomm/builders/op_mean_dim.py b/backends/qualcomm/builders/op_mean_dim.py
index 630b1b0b8de..10644e17c79 100644
--- a/backends/qualcomm/builders/op_mean_dim.py
+++ b/backends/qualcomm/builders/op_mean_dim.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import cast, Dict, List
+from typing import cast, Dict
 
 import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
 
@@ -40,7 +40,22 @@ def define_node(
         )
 
         # mean dims and keep dims
-        mean_dims = cast(List[int], node.args[1])
+        rank = len(input_node.meta["val"].shape)
+
+        if rank == 0:
+            raise RuntimeError(
+                "Mean doesn't support 0d input, please report a bug in https://github.com/pytorch/executorch/issues"
+            )
+
+        dim_arg = node.args[1]
+
+        if dim_arg is None or len(dim_arg) == 0:
+            mean_dims = list(range(rank))  # reduce over all dims
+        elif isinstance(dim_arg, int):
+            mean_dims = [dim_arg]
+        else:
+            mean_dims = list(dim_arg)
+
         mean_dims = [
             mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims
         ]
diff --git a/backends/qualcomm/builders/op_transpose.py b/backends/qualcomm/builders/op_transpose.py
index dbed10ced46..e7fd84e8e79 100644
--- a/backends/qualcomm/builders/op_transpose.py
+++ b/backends/qualcomm/builders/op_transpose.py
@@ -42,6 +42,8 @@ def define_node(
 
         # permutation
         permute_order = cast(List[int], node.args[1])
+        # to prevent negative values
+        permute_order = [x % len(permute_order) for x in permute_order]
         permute_order_shape = [len(permute_order)]
 
         output_tensor = input_tensor.permute(permute_order)
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
index 7a2924fe756..0a947759538 100644
--- a/backends/qualcomm/partition/common_defs.py
+++ b/backends/qualcomm/partition/common_defs.py
@@ -17,6 +17,7 @@
 to_be_implemented_operator = [
     exir_ops.edge.aten._adaptive_avg_pool3d.default,
     exir_ops.edge.aten.adaptive_max_pool2d.default,
+    exir_ops.edge.aten.adaptive_max_pool3d.default,
     exir_ops.edge.aten.avg_pool3d.default,
     exir_ops.edge.aten.div.Tensor_mode,
     exir_ops.edge.aten.log10.default,
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
index 6f1ef47c2ee..cf403a1a76d 100644
--- a/backends/qualcomm/quantizer/annotators.py
+++ b/backends/qualcomm/quantizer/annotators.py
@@ -1358,7 +1358,7 @@ def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None:
         )
 
 
-@register_annotator([torch.ops.aten.where.self])
+@register_annotator([torch.ops.aten.where.self, torch.ops.aten.where.ScalarSelf])
 def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]):
         return
@@ -1368,7 +1368,6 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None:
         assert isinstance(input_node, Node)
         if _is_float_tensor(input_node):
             input_qspec_map[input_node] = quantization_config.input_activation
-
     node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
         input_qspec_map=input_qspec_map,
         output_qspec=(
diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py
index 30af923781a..694fab3dc6b 100644
--- a/backends/qualcomm/quantizer/qconfig.py
+++ b/backends/qualcomm/quantizer/qconfig.py
@@ -200,7 +200,7 @@ def get_16a8w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
     extra_args: Dict[str, Any] = {"eps": 2**-20}
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -398,7 +398,7 @@ def get_ptq_per_block_quant_config(
 def get_8a8w_qnn_qat_config(
     act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.uint8,
         qscheme=(
             torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine
@@ -458,7 +458,7 @@ def get_8a8w_qnn_qat_config(
 def get_16a4w_qnn_qat_config(
     act_observer=MovingAverageMinMaxObserver,
 ) -> QuantizationConfig:
-    act_fake_quant_ctr = FakeQuantize.with_args(
+    act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
         dtype=torch.int32,
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
@@ -541,7 +541,7 @@ def get_qat_per_channel_quant_config(
         # If zero_point is 128, htp can do optimizations.
         # If we keep quant_min and quant_max none, observer will default use 128 as zero_point.
         # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired.
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
             dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
             qscheme=torch.per_tensor_symmetric,
             observer=act_observer,
@@ -553,7 +553,7 @@ def get_qat_per_channel_quant_config(
             observer_or_fake_quant_ctr=act_fake_quant_ctr,
         )
     else:
-        act_fake_quant_ctr = FakeQuantize.with_args(
+        act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args(
             dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype,
             quant_min=torch.iinfo(act_dtype).min,
             quant_max=torch.iinfo(act_dtype).max,
diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py
index 5943b54d968..44d129d5544 100644
--- a/backends/qualcomm/quantizer/quantizer.py
+++ b/backends/qualcomm/quantizer/quantizer.py
@@ -161,6 +161,7 @@ def __post_init__(self):
                 {
                     torch.ops.aten.conv1d.default,
                     torch.ops.aten.conv2d.default,
+                    torch.ops.aten.conv3d.default,
                     torch.ops.aten.conv_transpose2d.input,
                 }
             )
diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh
index c84911cf851..4cdd1efe6f4 100755
--- a/backends/qualcomm/scripts/build.sh
+++ b/backends/qualcomm/scripts/build.sh
@@ -86,6 +86,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
         -DEXECUTORCH_ENABLE_LOGGING=ON \
@@ -155,6 +156,7 @@ if [ "$BUILD_X86_64" = true ]; then
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py
index 35006a41433..747524a0e5b 100644
--- a/backends/qualcomm/scripts/download_qnn_sdk.py
+++ b/backends/qualcomm/scripts/download_qnn_sdk.py
@@ -6,12 +6,15 @@
 import platform
 import re
 import shutil
+import subprocess
+import sys
 import tarfile
 import tempfile
 import urllib.request
 import zipfile
 from typing import Dict, List, Optional, Tuple
 
+
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
@@ -34,68 +37,81 @@ def is_linux_x86() -> bool:
     )
 
 
-import subprocess
+#########################
+# Cache directory helper
+#########################
 
-MINIMUM_LIBC_VERSION = 2.29
+APP_NAMESPACE = ["executorch", "qnn"]
 
-REQUIRED_LIBC_LIBS = [
-    "/lib/x86_64-linux-gnu/libc.so.6",
-    "/lib64/libc.so.6",
-    "/lib/libc.so.6",
-]
 
+def _get_staging_dir(*parts: str) -> pathlib.Path:
+    r"""
+    Return a cross-platform staging directory for staging SDKs/libraries.
+
+    - On Linux:
+        ~/.cache/executorch/qnn/<parts...>
+        (falls back to $HOME/.cache if $XDG_CACHE_HOME is unset)
 
-def check_glibc_exist_and_validate() -> bool:
+    - On Windows (not supported yet, but as placeholder):
+        %LOCALAPPDATA%\executorch\qnn\<parts...>
+        (falls back to $HOME/AppData/Local if %LOCALAPPDATA% is unset)
+
+    - Override:
+        If QNN_STAGING_DIR is set in the environment, that path is used instead.
+
+    Args:
+        parts (str): Subdirectories to append under the root staging dir.
+
+    Returns:
+        pathlib.Path: Fully qualified staging path.
     """
-    Check if users have glibc installed.
+    # Environment override wins
+    base = os.environ.get("QNN_STAGING_DIR")
+    if base:
+        return pathlib.Path(base).joinpath(*parts)
+
+    system = platform.system().lower()
+    if system == "windows":
+        # On Windows, prefer %LOCALAPPDATA%, fallback to ~/AppData/Local
+        base = pathlib.Path(
+            os.environ.get("LOCALAPPDATA", pathlib.Path.home() / "AppData" / "Local")
+        )
+    elif is_linux_x86():
+        # On Linux/Unix, prefer $XDG_CACHE_HOME, fallback to ~/.cache
+        base = pathlib.Path(
+            os.environ.get("XDG_CACHE_HOME", pathlib.Path.home() / ".cache")
+        )
+    else:
+        raise ValueError(f"Unsupported platform: {system}")
+
+    return base.joinpath(*APP_NAMESPACE, *parts)
+
+
+def _atomic_download(url: str, dest: pathlib.Path):
     """
-    exists = False
-    for path in REQUIRED_LIBC_LIBS:
-        try:
-            output = subprocess.check_output(
-                [path, "--version"], stderr=subprocess.STDOUT
-            )
-            output = output.decode().split("\n")[0]
-            logger.debug(f"[QNN] glibc version for path {path} is: {output}")
-            match = re.search(r"version (\d+\.\d+)", output)
-            if match:
-                version = match.group(1)
-                if float(version) >= MINIMUM_LIBC_VERSION:
-                    logger.debug(f"[QNN] glibc version is {version}.")
-                    exists = True
-                    return True
-                else:
-                    logger.error(
-                        f"[QNN] glibc version is too low. The minimum libc version is {MINIMUM_LIBC_VERSION} Please install glibc following the commands below."
-                    )
-            else:
-                logger.error("[QNN] glibc version not found.")
+    Download URL into dest atomically:
+      - Write to a temp file in the same dir
+      - Move into place if successful
+    """
+    dest.parent.mkdir(parents=True, exist_ok=True)
 
-        except Exception:
-            continue
+    # Temp file in same dir (guarantees atomic rename)
+    with tempfile.NamedTemporaryFile(dir=dest.parent, delete=False) as tmp:
+        tmp_path = pathlib.Path(tmp.name)
 
-    if not exists:
-        logger.error(
-            r""""
-            [QNN] glibc not found or the version is too low. Please install glibc following the commands below.
-            Ubuntu/Debian:
-                sudo apt update
-                sudo apt install libc6
-
-            Fedora/Red Hat:
-                sudo dnf install glibc
-
-            Arch Linux:
-                sudo pacman -S glibc
-            
-            Also please make sure the glibc version is >= MINIMUM_LIBC_VERSION. You can verify the glibc version by running the following command:
-            Option 1:
-                ldd --version
-            Option 2:
-                /path/to/libc.so.6 --version
-            """
-        )
-    return exists
+    try:
+        urllib.request.urlretrieve(url, tmp_path)
+        tmp_path.replace(dest)  # atomic rename
+    except Exception:
+        # Clean up partial file on failure
+        if tmp_path.exists():
+            tmp_path.unlink(missing_ok=True)
+        raise
+
+
+####################
+# qnn sdk download management
+####################
 
 
 def _download_archive(url: str, archive_path: pathlib.Path) -> bool:
@@ -178,9 +194,6 @@ def _download_qnn_sdk(dst_folder=SDK_DIR) -> Optional[pathlib.Path]:
     if not is_linux_x86():
         logger.info("[QNN] Skipping Qualcomm SDK (only supported on Linux x86).")
         return None
-    elif not check_glibc_exist_and_validate():
-        logger.info("[QNN] Skipping Qualcomm SDK (glibc not found or version too old).")
-        return None
     else:
         logger.info("[QNN] Downloading Qualcomm SDK for Linux x86")
 
@@ -241,6 +254,136 @@ def _extract_tar(archive_path: pathlib.Path, prefix: str, target_dir: pathlib.Pa
                     dst.write(src.read())
 
 
+####################
+# libc management
+####################
+
+GLIBC_VERSION = "2.34"
+GLIBC_REEXEC_GUARD = "QNN_GLIBC_REEXEC"
+MINIMUM_LIBC_VERSION = GLIBC_VERSION
+
+
+def _get_glibc_libdir() -> pathlib.Path:
+    glibc_root = _get_staging_dir(f"glibc-{GLIBC_VERSION}")
+    return glibc_root / "lib"
+
+
+def _parse_version(v: str) -> tuple[int, int]:
+    """Turn '2.34' → (2,34) so it can be compared."""
+    parts = v.split(".")
+    return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0
+
+
+def _current_glibc_version() -> str:
+    """Return system glibc version string (via ctypes)."""
+    try:
+        libc = ctypes.CDLL("libc.so.6")
+        func = libc.gnu_get_libc_version
+        func.restype = ctypes.c_char_p
+        return func().decode()
+    except Exception as e:
+        return f"error:{e}"
+
+
+def _resolve_glibc_loader() -> pathlib.Path | None:
+    """Return staged ld.so path if available."""
+    for p in [
+        _get_glibc_libdir() / f"ld-{GLIBC_VERSION}.so",
+        _get_glibc_libdir() / "ld-linux-x86-64.so.2",
+    ]:
+        if p.exists():
+            return p
+    return None
+
+
+def _stage_prebuilt_glibc():
+    """Download + extract Fedora 35 glibc RPM into /tmp."""
+    logger.info(">>> Staging prebuilt glibc-%s from Fedora 35 RPM", GLIBC_VERSION)
+    _get_glibc_libdir().mkdir(parents=True, exist_ok=True)
+    rpm_path = _get_staging_dir("glibc") / "glibc.rpm"
+    work_dir = _get_staging_dir("glibc") / "extracted"
+    rpm_url = (
+        "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/"
+        "Everything/x86_64/os/Packages/g/glibc-2.34-7.fc35.x86_64.rpm"
+    )
+
+    rpm_path.parent.mkdir(parents=True, exist_ok=True)
+    logger.info("[glibc] Downloading %s -> %s", rpm_url, rpm_path)
+    try:
+        urllib.request.urlretrieve(rpm_url, rpm_path)
+    except Exception as e:
+        logger.error("[glibc] Failed to download %s: %s", rpm_url, e)
+        raise
+
+    # Extract
+    if work_dir.exists():
+        shutil.rmtree(work_dir)
+    work_dir.mkdir(parents=True)
+    subprocess.check_call(["bsdtar", "-C", str(work_dir), "-xf", str(rpm_path)])
+
+    # Copy runtime libs
+    staged = [
+        "ld-linux-x86-64.so.2",
+        "libc.so.6",
+        "libdl.so.2",
+        "libpthread.so.0",
+        "librt.so.1",
+        "libm.so.6",
+        "libutil.so.1",
+    ]
+    for lib in staged:
+        src = work_dir / "lib64" / lib
+        if src.exists():
+            shutil.copy2(src, _get_glibc_libdir() / lib)
+            logger.info("[glibc] Staged %s", lib)
+        else:
+            logger.warning("[glibc] Missing %s in RPM", lib)
+
+
+def ensure_glibc_minimum(min_version: str = GLIBC_VERSION):
+    """
+    Ensure process runs under glibc >= min_version.
+    - If system glibc is new enough → skip.
+    - Else → stage Fedora RPM and re-exec under staged loader.
+    """
+    current = _current_glibc_version()
+    logger.info("[glibc] Current loaded glibc: %s", current)
+
+    # If system glibc already sufficient → skip everything
+    m = re.match(r"(\d+\.\d+)", current)
+    if m and _parse_version(m.group(1)) >= _parse_version(min_version):
+        logger.info("[glibc] System glibc >= %s, no staging needed.", min_version)
+        return
+
+    # Avoid infinite loop
+    if os.environ.get(GLIBC_REEXEC_GUARD) == "1":
+        logger.info("[glibc] Already re-exec'd once, continuing.")
+        return
+
+    # Stage prebuilt if not already staged
+    if not (_get_glibc_libdir() / "libc.so.6").exists():
+        _stage_prebuilt_glibc()
+
+    loader = _resolve_glibc_loader()
+    if not loader:
+        logger.error("[glibc] Loader not found in %s", _get_glibc_libdir())
+        return
+
+    logger.info(
+        "[glibc] Re-execing under loader %s with libdir %s", loader, _get_glibc_libdir()
+    )
+    os.environ[GLIBC_REEXEC_GUARD] = "1"
+    os.execv(
+        str(loader),
+        [str(loader), "--library-path", str(_get_glibc_libdir()), sys.executable]
+        + sys.argv,
+    )
+
+
+####################
+# libc++ management
+####################
+
 LLVM_VERSION = "14.0.0"
 LIBCXX_BASE_NAME = f"clang+llvm-{LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04"
 LLVM_URL = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/{LIBCXX_BASE_NAME}.tar.xz"
@@ -258,12 +401,17 @@ def _stage_libcxx(target_dir: pathlib.Path):
         logger.info("[libcxx] Already staged at %s, skipping download", target_dir)
         return
 
-    temp_tar = pathlib.Path("/tmp") / f"{LIBCXX_BASE_NAME}.tar.xz"
-    temp_extract = pathlib.Path("/tmp") / LIBCXX_BASE_NAME
+    libcxx_stage = _get_staging_dir(f"libcxx-{LLVM_VERSION}")
+    temp_tar = libcxx_stage / f"{LIBCXX_BASE_NAME}.tar.xz"
+    temp_extract = libcxx_stage / LIBCXX_BASE_NAME
 
     if not temp_tar.exists():
         logger.info("[libcxx] Downloading %s", LLVM_URL)
-        urllib.request.urlretrieve(LLVM_URL, temp_tar)
+        _atomic_download(LLVM_URL, temp_tar)
+
+    # Sanity check before extracting
+    if not temp_tar.exists() or temp_tar.stat().st_size == 0:
+        raise FileNotFoundError(f"[libcxx] Tarball missing or empty: {temp_tar}")
 
     logger.info("[libcxx] Extracting %s", temp_tar)
     with tarfile.open(temp_tar, "r:xz") as tar:
@@ -437,8 +585,10 @@ def install_qnn_sdk() -> bool:
     Returns:
         True if both steps succeeded (or were already satisfied), else False.
     """
-    if check_glibc_exist_and_validate():
-        if _ensure_libcxx_stack():
-            if _ensure_qnn_sdk_lib():
-                return True
-    return False
+    logger.info("[QNN] Starting SDK installation")
+
+    # Make sure we’re running under >= 2.34
+    ensure_glibc_minimum(GLIBC_VERSION)
+
+    # libc++ and QNN SDK setup
+    return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib()
diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS
index 639303c7eb8..d968f954485 100644
--- a/backends/qualcomm/tests/TARGETS
+++ b/backends/qualcomm/tests/TARGETS
@@ -47,3 +47,17 @@ runtime.python_library(
         ":test_qnn_delegate"
     ]
 )
+
+runtime.python_test(
+    name = "test_passes",
+    srcs = [
+        "test_passes.py",
+    ],
+    deps = [
+        "fbsource//third-party/pypi/expecttest:expecttest",  # @manual
+        "//caffe2:torch",
+        "//executorch/exir:lib",
+        "//executorch/backends/qualcomm/_passes:passes",
+        "//executorch/backends/qualcomm/builders:builders",
+    ],
+)
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index a37648cb6be..5ea6caf54ad 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -4,8 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import torch
+from typing import List, Optional, Tuple, Union
 
+import torch
 
 # module with related operator only
 
@@ -170,21 +171,23 @@ def forward(self, y):
 
 
 class Argmax(torch.nn.Module):
-    def __init__(self):
+    def __init__(self, dim: Optional[int] = None, keepdim: bool = False):
         super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
 
     def forward(self, x):
-        x = torch.argmax(x, dim=0, keepdim=True)
-        return x
+        return torch.argmax(x, dim=self.dim, keepdim=self.keepdim)
 
 
 class Argmin(torch.nn.Module):
-    def __init__(self):
+    def __init__(self, dim: Optional[int] = None, keepdim: bool = False):
         super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
 
     def forward(self, x):
-        x = torch.argmin(x, dim=0, keepdim=True)
-        return x
+        return torch.argmin(x, dim=self.dim, keepdim=self.keepdim)
 
 
 class ArgminViewSqueezeConv2D(torch.nn.Module):
@@ -597,28 +600,6 @@ def forward(self, x):
         return self.second(self.first(x))
 
 
-class Conv3dSequential(torch.nn.Module):
-    def __init__(self, bias=True):
-        super().__init__()
-        self.first = torch.nn.Conv3d(
-            in_channels=1,
-            out_channels=3,
-            kernel_size=(3, 3, 3),
-            padding=1,
-            bias=bias,
-        )
-        self.second = torch.nn.Conv3d(
-            in_channels=3,
-            out_channels=2,
-            kernel_size=(3, 3, 3),
-            padding=1,
-            bias=bias,
-        )
-
-    def forward(self, x):
-        return self.second(self.first(x))
-
-
 class Conv2dSingle(torch.nn.Module):
     def __init__(
         self,
@@ -725,6 +706,28 @@ def forward(self, x):
         return topk_values
 
 
+class Conv3dSequential(torch.nn.Module):
+    def __init__(self, bias=True):
+        super().__init__()
+        self.first = torch.nn.Conv3d(
+            in_channels=1,
+            out_channels=3,
+            kernel_size=(3, 3, 3),
+            padding=1,
+            bias=bias,
+        )
+        self.second = torch.nn.Conv3d(
+            in_channels=3,
+            out_channels=2,
+            kernel_size=(3, 3, 3),
+            padding=1,
+            bias=bias,
+        )
+
+    def forward(self, x):
+        return self.second(self.first(x))
+
+
 class ConvTranspose1dSingle(torch.nn.Module):
     def __init__(self, bias=True, dilation=1):
         super().__init__()
@@ -1138,20 +1141,62 @@ def forward(self, input_pos, k_val):
 
 
 class IndexPut(torch.nn.Module):
-    def __init__(self, skip_mutable_buffer=False):
+    def __init__(self, skip_mutable_buffer=False, mode=0):
         super().__init__()
         self.skip_mutable_buffer = skip_mutable_buffer
         self.register_buffer(
             "k_cache",
-            torch.zeros((1, 1024, 12, 64), dtype=torch.float32),
+            torch.zeros((2, 1024, 12, 64), dtype=torch.float32),
             persistent=True,
         )
+        self.mode = mode
 
     def forward(self, input_pos, k_val):
-        k_out = torch.ops.aten.index_put_(self.k_cache, [None, input_pos], k_val)
+        match self.mode:
+            case 0:
+                k_out = torch.ops.aten.index_put_(self.k_cache, [input_pos], k_val)
+            case 1:
+                k_out = torch.ops.aten.index_put_(
+                    self.k_cache, [None, input_pos], k_val
+                )
+            case 2:
+                k_out = torch.ops.aten.index_put_(
+                    self.k_cache, [None, None, input_pos], k_val
+                )
+            case 3:
+                k_out = torch.ops.aten.index_put_(
+                    self.k_cache, [input_pos[0], input_pos[1]], k_val
+                )
+            case 4:
+                k_out = torch.ops.aten.index_put_(
+                    self.k_cache, [None, input_pos[0], input_pos[1]], k_val
+                )
+            case 5:
+                k_out = torch.ops.aten.index_put_(
+                    self.k_cache, [input_pos[0], None, input_pos[1]], k_val
+                )
+
         return k_out + 0
 
 
+class IndexPutSuite(torch.nn.Module):
+    def __init__(self, accumulate=False, in_place=False):
+        super().__init__()
+        self.accumulate = accumulate
+        self.in_place = in_place
+
+    def forward(self, x, indices, values):
+        if self.in_place:
+            # Clone the input to avoid modifying it in-place
+            result = x.clone()
+            # Apply index_put_ and return the modified tensor
+            result.index_put_(indices, values, self.accumulate)
+            return result
+        else:
+            # Use the non-in-place variant which returns a new tensor
+            return torch.index_put(x, indices, values, self.accumulate)
+
+
 class IndexSelect(torch.nn.Module):
     def __init__(self, dim):
         super().__init__()
@@ -1332,20 +1377,20 @@ def forward(self, x):
         return self.max_pool2d(x)
 
 
-class MeanWKeppDim(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return torch.mean(x, (-1, -2), keepdim=True)
-
-
-class MeanWOKeppDim(torch.nn.Module):
-    def __init__(self):
+class Mean(torch.nn.Module):
+    def __init__(
+        self,
+        dim: Optional[Union[int, Tuple[int, ...], List[int]]] = None,
+        keepdim: bool = False,
+        dtype: Optional[torch.dtype] = None,
+    ):
         super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+        self.dtype = dtype
 
     def forward(self, x):
-        return torch.mean(x, (-1, -2))
+        return torch.mean(x, dim=self.dim, keepdim=self.keepdim, dtype=self.dtype)
 
 
 class MaskedFill(torch.nn.Module):
@@ -1506,6 +1551,15 @@ def forward(self, x):
         )
 
 
+class Permute(torch.nn.Module):
+    def __init__(self, dims: List[int]):
+        super().__init__()
+        self.dims = dims
+
+    def forward(self, x):
+        return x.permute(self.dims)
+
+
 class PixelShuffle(torch.nn.Module):
     def __init__(self, scale):
         super().__init__()
@@ -1539,11 +1593,12 @@ def forward(self, x):
 
 
 class PowTensorScalar(torch.nn.Module):
-    def __init__(self):
+    def __init__(self, exponent=2):
         super().__init__()
+        self.exponent = exponent
 
     def forward(self, x):
-        return torch.pow(x, 2)
+        return torch.pow(x, self.exponent)
 
 
 class PReLUDefault(torch.nn.Module):
@@ -2000,6 +2055,19 @@ def forward(self, x):
         return torch.tanh(x)
 
 
+class Threshold(torch.nn.Module):
+    def __init__(self, threshold=0.0, value=0.0, inplace=False):
+        super().__init__()
+        self.threshold = threshold
+        self.value = value
+        self.inplace = inplace
+
+    def forward(self, x):
+        return torch.nn.functional.threshold(
+            x, threshold=self.threshold, value=self.value, inplace=self.inplace
+        )
+
+
 class TopKandIndex(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_passes.py b/backends/qualcomm/tests/test_passes.py
new file mode 100644
index 00000000000..94a5d08acc1
--- /dev/null
+++ b/backends/qualcomm/tests/test_passes.py
@@ -0,0 +1,54 @@
+import unittest
+
+import torch
+from executorch.backends.qualcomm._passes import InsertReshapeForReduceOps
+
+
+class TestPasses(unittest.TestCase):
+    def test_insert_reshape_for_argmax(self):
+        class ArgmaxModule(torch.nn.Module):
+            def forward(self, x):
+                return torch.argmax(x, dim=None)
+
+        mod = ArgmaxModule()
+
+        x = torch.tensor([[1.0, 5.0], [3.0, 2.0]])
+        ep = torch.export.export(mod, (x,))
+        # Run original module for reference
+        ref = mod(x)
+
+        reshape_nodes = [
+            n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default
+        ]
+        argmax_nodes = [
+            n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default
+        ]
+        self.assertTrue(len(reshape_nodes) == 0, "Reshape node not inserted")
+        self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing")
+
+        InsertReshapeForReduceOps()(ep.graph_module)
+
+        out = ep.graph_module(x)
+
+        # Check graph structure: argmax should take a reshape as input
+        reshape_nodes = [
+            n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default
+        ]
+        argmax_nodes = [
+            n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default
+        ]
+        self.assertTrue(len(reshape_nodes) == 1, "Reshape node should be inserted")
+        self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing")
+
+        argmax_node = argmax_nodes[0]
+        self.assertEqual(argmax_node.args[1], 0, "Argmax dim not set to 0")
+
+        # Execute new graph and compare with reference
+        out = ep.graph_module(x)
+        self.assertTrue(
+            torch.equal(*out, ref), f"Output mismatch: got {out}, expected {ref}"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 6c444c90c08..2641acc5a2d 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import io
+import itertools
 import json
 import subprocess
 import sys
@@ -173,14 +174,64 @@ def test_qnn_backend_arange(self):
                 self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_argmax(self):
-        module = Argmax()  # noqa: F405
-        sample_input = (torch.randn(16, 3, 4, 4),)
-        self.lower_module_and_test_output(module, sample_input)
+        test_cases = [
+            {
+                QCOM_MODULE: Argmax(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=1, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=None, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=2, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),),
+            },
+        ]
+
+        for i, case in enumerate(test_cases):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(
+                    case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
+                )
 
     def test_qnn_backend_argmin(self):
-        module = Argmin()  # noqa: F405
-        sample_input = (torch.rand(3, 4),)
-        self.lower_module_and_test_output(module, sample_input)
+        test_cases = [
+            {
+                QCOM_MODULE: Argmin(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=1, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=None, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=2, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),),
+            },
+        ]
+
+        for i, case in enumerate(test_cases):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(
+                    case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
+                )
 
     @unittest.expectedFailure
     def test_qnn_backend_asin(self):
@@ -347,8 +398,8 @@ def test_qnn_backend_cumsum(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
@@ -416,8 +467,8 @@ def test_qnn_backend_element_wise_add(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_element_wise_and(self):
         module = And(torch.tensor(1.7), torch.tensor(0.2))  # noqa: F405
@@ -455,8 +506,8 @@ def test_qnn_backend_element_wise_div(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_element_wise_mul(self):
         test_comb = [
@@ -482,8 +533,8 @@ def test_qnn_backend_element_wise_mul(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_element_wise_or(self):
         test_comb = [
@@ -557,8 +608,8 @@ def test_qnn_backend_element_wise_sub(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     @unittest.expectedFailure
     def test_qnn_backend_elu(self):
@@ -600,10 +651,10 @@ def test_qnn_backend_expand(self):
         for module in modules:
             for sample_input in sample_inputs:
                 with self.subTest(i=index):
+                    index += 1
                     self.lower_module_and_test_output(
                         module, sample_input, passes_job=passes_job
                     )
-                    index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
@@ -626,6 +677,21 @@ def test_qnn_backend_floor_divide(self):
             {
                 QCOM_MODULE: [FloorDiv()],  # noqa: F405
                 QCOM_SAMPLE_INPUTS: [
+                    (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)),
+                    (
+                        torch.randint(-100, 100, (10, 10)).float(),
+                        torch.full((10, 10), 2.5),
+                    ),
+                    (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)),
+                    (torch.tensor([10]), torch.arange(1, 5)),  # Failed
+                    (torch.arange(-10, 10), torch.tensor([2])),
+                    (torch.randint(-100, 100, (20,)), torch.full((20,), 2)),
+                    (torch.randint(-100, 100, (5, 10)), torch.full((5, 10), 2)),
+                    (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)),
+                    (
+                        torch.randint(-100, 100, (2, 3, 4, 5)),
+                        torch.full((2, 3, 4, 5), 2),
+                    ),
                     (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)),
                     (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])),
                 ],
@@ -641,8 +707,8 @@ def test_qnn_backend_floor_divide(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_fold(self):
         sample_input = (torch.randn(3, 512, 256),)
@@ -822,28 +888,191 @@ def test_qnn_backend_index_copy(self):
                 )
 
     def test_qnn_backend_index_put(self):
-        test_comb = [
-            {
-                QCOM_MODULE: IndexPut(skip_mutable_buffer=False),  # noqa: F405
-                QCOM_SAMPLE_INPUTS: (
-                    torch.tensor([2], dtype=torch.int32),
-                    torch.randn([1, 1, 12, 64]),
+        skip_mutable_buffer = [False, True]
+        total_test_combo = []
+        # mode 0
+        sample_inputs = [
+            (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])),
+            (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 1
+        sample_inputs = [
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])),
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 2
+        sample_inputs = [
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])),
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 3
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
                 ),
-            },
-            {
-                QCOM_MODULE: IndexPut(skip_mutable_buffer=True),  # noqa: F405
-                QCOM_SAMPLE_INPUTS: (
-                    torch.tensor([2], dtype=torch.int32),
-                    torch.randn([1, 1, 12, 64]),
+                torch.randn([2, 12, 64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
                 ),
-            },
+                torch.randn([1, 64]),
+            ),
         ]
-        for i, test in enumerate(test_comb):
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 4
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([2, 64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([1, 64]),
+            ),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 5
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([1]),
+            ),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+
+        for i, test_combo in enumerate(total_test_combo):
+            for j, combo in enumerate(test_combo):
+                with self.subTest(f"mode_{i}-{j}"):
+                    self.lower_module_and_test_output(
+                        IndexPut(skip_mutable_buffer=combo[0], mode=i),  # noqa: F405
+                        combo[1],
+                        skip_mutable_buffer=combo[0],
+                    )
+
+    def test_qnn_backend_index_put_suite(self):
+        accumulate = [False, True]
+        in_place = [False, True]
+        sample_inputs = [
+            # basic
+            (
+                torch.rand(5, 2) * 100,
+                (torch.tensor([0, 2]),),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])),
+            # shape
+            (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])),
+            (
+                torch.rand(5, 2),
+                (torch.tensor([0, 2]), torch.tensor([1, 1])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (
+                torch.rand(5, 3, 2),
+                (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            # TODO: not supported by HTP
+            # (
+            #     torch.rand(5, 3, 2, 4),
+            #     (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])),
+            #     torch.tensor([10.0]),
+            # ),
+            # indices
+            (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])),
+            (
+                torch.rand(5, 3),
+                (torch.tensor([0, 2, 4]),),
+                torch.tensor([10.0, 20.0, 30.0]),
+            ),
+            (
+                torch.rand(5),
+                (torch.tensor([1, 1, 3, 3]),),
+                torch.tensor([10.0, 20.0, 30.0, 40.0]),
+            ),
+            # broadcasting
+            (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])),
+            (
+                torch.rand(3, 4),
+                (torch.tensor([0, 1]), torch.tensor([1, 2])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])),
+            (
+                torch.rand(3, 2, 2),
+                (torch.tensor([0, 1]),),
+                torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
+            ),
+            (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])),
+            # two-index
+            (
+                torch.rand(4, 3),
+                (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])),
+                torch.tensor([10.0, 20.0, 30.0]),
+            ),
+            (
+                torch.rand(3, 3),
+                (torch.tensor([0, 2]), torch.tensor([1, 1])),
+                torch.tensor([15.0, 25.0]),
+            ),
+            (
+                torch.rand(3, 2),
+                (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])),
+                torch.tensor([5.0, 10.0, 15.0]),
+            ),
+            (
+                torch.rand(3, 2),
+                (torch.tensor([1]), torch.tensor([0, 0, 1])),
+                torch.tensor([5.0, 10.0, 15.0]),
+            ),
+        ]
+        test_combo = list(itertools.product(accumulate, in_place, sample_inputs))
+        for i, combo in enumerate(test_combo):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(
-                    test[QCOM_MODULE],
-                    test[QCOM_SAMPLE_INPUTS],
-                    skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer,
+                    IndexPutSuite(accumulate=combo[0], in_place=combo[1]),  # noqa: F405
+                    combo[2],
                 )
 
     def test_qnn_backend_index_select(self):
@@ -922,8 +1151,8 @@ def test_qnn_backend_leaky_relu(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_less_equal(self):
         test_comb = [
@@ -1018,12 +1247,61 @@ def test_qnn_backend_max_pool2d(self):
         sample_input = (torch.randn(4, 3, 24, 24),)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_mean_dim(self):
-        modules = [MeanWKeppDim(), MeanWOKeppDim()]  # noqa: F405
-        sample_input = (torch.randn([2, 5, 1, 3]),)
-        for i, module in enumerate(modules):
+    def test_qnn_backend_mean(self):
+        test_comb = [
+            # Reduce over last two dims, keepdim=True
+            {
+                QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+            },
+            # Reduce over last two dims, keepdim=False
+            {
+                QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+            },
+            # Default: reduce all dims
+            {
+                QCOM_MODULE: Mean(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+            },
+            # TODO: To be enabled via reshape input to 1d tensor
+            # # Scalar case
+            # {
+            #     QCOM_MODULE: Mean(),
+            #     QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+            # },
+            # Edge case: dim is a empty list
+            {
+                QCOM_MODULE: Mean(dim=[]),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along dim=0 (batch dimension)
+            {
+                QCOM_MODULE: Mean(dim=0),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along dim=0 with keepdim=True
+            {
+                QCOM_MODULE: Mean(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along multiple dims
+            {
+                QCOM_MODULE: Mean(dim=(0, 2)),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+            },
+            # Edge case: high-dimensional tensor
+            {
+                QCOM_MODULE: Mean(dim=(1, 3), keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+            },
+        ]
+
+        for i, test in enumerate(test_comb):
             with self.subTest(i=i):
-                self.lower_module_and_test_output(module, sample_input)
+                self.lower_module_and_test_output(
+                    test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+                )
 
     @unittest.skip("failed to lower in QNN 2.26")
     def test_qnn_backend_mha(self):
@@ -1068,6 +1346,16 @@ def test_qnn_backend_pad(self):
         sample_input = (torch.randn([1, 8, 128]),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_permute(self):
+        modules = [
+            Permute([0, 2, 3, 1]),  # noqa: F405
+            Permute([-1, -3, -2, -4]),  # noqa: F405
+        ]
+        sample_input = (torch.randn([2, 3, 4, 5]),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_pixel_shuffle(self):
         module = PixelShuffle(2)  # noqa: F405
         sample_input = (torch.ones([2, 4, 3, 3]),)
@@ -1079,9 +1367,28 @@ def test_qnn_backend_pixel_unshuffle(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_pow_tensor_scalar(self):
-        module = PowTensorScalar()  # noqa: F405
-        sample_input = (torch.rand([2, 4, 3, 3]),)
-        self.lower_module_and_test_output(module, sample_input)
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    PowTensorScalar(),  # noqa: F405
+                    PowTensorScalar(1),  # noqa: F405
+                    PowTensorScalar(-1),  # noqa: F405
+                    PowTensorScalar(0.5),  # noqa: F405
+                ],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)],
+            },
+            {
+                QCOM_MODULE: [PowTensorScalar(10)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)],
+            },
+        ]
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_prelu(self):
         test_comb = [
@@ -1100,8 +1407,8 @@ def test_qnn_backend_prelu(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_relu(self):
         module = Relu()  # noqa: F405
@@ -1216,10 +1523,8 @@ def test_qnn_backend_slice_scatter(self):
                 ],
                 QCOM_SAMPLE_INPUTS: [
                     (
-                        (
-                            torch.zeros(8, 8),
-                            torch.ones(8, 2),
-                        )
+                        torch.zeros(8, 8),
+                        torch.ones(8, 2),
                     )
                 ],
             },
@@ -1230,8 +1535,8 @@ def test_qnn_backend_slice_scatter(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        self.lower_module_and_test_output(module, sample_input)
                         index += 1
+                        self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_stack(self):
         module = Stack()  # noqa: F405
@@ -1274,6 +1579,17 @@ def test_qnn_backend_tanh(self):
         sample_input = (torch.randn(2, 5, 1, 3),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_threshold(self):
+        modules = [
+            Threshold(),  # noqa: F405
+            Threshold(threshold=0.5, value=3.0, inplace=True),  # noqa: F405
+            Threshold(threshold=0.5, value=3.0, inplace=False),  # noqa: F405
+        ]
+        sample_input = (torch.randn(2, 5, 1, 3),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_unflatten(self):
         module = Unflatten(dim=1, sizes=(2, 3, 4))  # noqa: F405
         sample_input = (torch.randn([1, 24]),)
@@ -1710,16 +2026,66 @@ def test_qnn_backend_arange(self):
                 self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_argmax(self):
-        module = Argmax()  # noqa: F405
-        sample_input = (torch.randn(16, 3, 4, 4),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
+        test_cases = [
+            {
+                QCOM_MODULE: Argmax(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=1, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=None, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),),
+            },
+            {
+                QCOM_MODULE: Argmax(dim=2, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),),
+            },
+        ]
+
+        for i, case in enumerate(test_cases):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(
+                    case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
+                )
+                self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS])
 
     def test_qnn_backend_argmin(self):
-        module = Argmin()  # noqa: F405
-        sample_input = (torch.randn(16, 3, 4, 4),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
+        test_cases = [
+            {
+                QCOM_MODULE: Argmin(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=1, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=None, keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),),
+            },
+            {
+                QCOM_MODULE: Argmin(dim=2, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),),
+            },
+        ]
+
+        for i, case in enumerate(test_cases):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(
+                    case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS]
+                )
+                self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS])
 
     def test_qnn_backend_asin(self):
         module = Asin()  # noqa: F405
@@ -1981,9 +2347,9 @@ def test_qnn_backend_element_wise_add(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         gm = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(gm, sample_input)
-                        index += 1
 
     def test_qnn_backend_element_wise_and(self):
         module = And(torch.tensor(1.7), torch.tensor(0.2))  # noqa: F405
@@ -2022,9 +2388,9 @@ def test_qnn_backend_element_wise_div(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         gm = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(gm, sample_input)
-                        index += 1
 
     def test_qnn_backend_element_wise_mul(self):
         test_comb = [
@@ -2050,9 +2416,9 @@ def test_qnn_backend_element_wise_mul(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         gm = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(gm, sample_input)
-                        index += 1
 
     def test_qnn_backend_element_wise_or(self):
         test_comb = [
@@ -2128,9 +2494,9 @@ def test_qnn_backend_element_wise_sub(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         gm = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(gm, sample_input)
-                        index += 1
 
     def test_qnn_backend_elu(self):
         module = Elu()  # noqa: F405
@@ -2179,11 +2545,11 @@ def test_qnn_backend_expand(self):
         for module in modules:
             for sample_input in sample_inputs:
                 with self.subTest(i=index):
+                    index += 1
                     module = self.get_qdq_module(module, sample_input)
                     self.lower_module_and_test_output(
                         module, sample_input, passes_job=passes_job
                     )
-                    index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
@@ -2209,6 +2575,21 @@ def test_qnn_backend_floor_divide(self):
             {
                 QCOM_MODULE: [FloorDiv()],  # noqa: F405
                 QCOM_SAMPLE_INPUTS: [
+                    (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)),
+                    (
+                        torch.randint(-100, 100, (10, 10)).float(),
+                        torch.full((10, 10), 2.5),
+                    ),
+                    (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)),
+                    (torch.tensor([10]), torch.arange(1, 5)),
+                    (torch.arange(-10, 10), torch.tensor([2])),
+                    (torch.randint(-100, 100, (20,)), torch.full((20,), 2)),
+                    (torch.randint(-100, 100, (5, 10)), torch.full((5, 10), 2)),
+                    (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)),
+                    (
+                        torch.randint(-100, 100, (2, 3, 4, 5)),
+                        torch.full((2, 3, 4, 5), 2),
+                    ),
                     (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)),
                     (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])),
                 ],
@@ -2224,9 +2605,12 @@ def test_qnn_backend_floor_divide(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
-                        gm = self.get_qdq_module(module, sample_input)
-                        self.lower_module_and_test_output(gm, sample_input)
                         index += 1
+                        # Support int input cases with bypass_check=True
+                        gm = self.get_qdq_module(
+                            module, sample_input, bypass_check=True
+                        )
+                        self.lower_module_and_test_output(gm, sample_input)
 
     def test_qnn_backend_fold(self):
         sample_input = (torch.randn(3, 512, 256),)
@@ -2422,32 +2806,197 @@ def test_qnn_backend_index_copy(self):
                 )
 
     def test_qnn_backend_index_put(self):
-        test_comb = [
-            {
-                QCOM_MODULE: IndexPut(skip_mutable_buffer=False),  # noqa: F405
-                QCOM_SAMPLE_INPUTS: (
-                    torch.tensor([2], dtype=torch.int32),
-                    torch.randn([1, 1, 12, 64]),
+        skip_mutable_buffer = [False, True]
+        total_test_combo = []
+        # mode 0
+        sample_inputs = [
+            (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])),
+            (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 1
+        sample_inputs = [
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])),
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 2
+        sample_inputs = [
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])),
+            (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])),
+            (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])),
+            (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 3
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
                 ),
-            },
-            {
-                QCOM_MODULE: IndexPut(skip_mutable_buffer=True),  # noqa: F405
-                QCOM_SAMPLE_INPUTS: (
-                    torch.tensor([2], dtype=torch.int32),
-                    torch.randn([1, 1, 12, 64]),
+                torch.randn([2, 12, 64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
                 ),
-            },
+                torch.randn([1, 64]),
+            ),
         ]
-        for i, test in enumerate(test_comb):
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 4
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([2, 64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([1, 64]),
+            ),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+        # mode 5
+        sample_inputs = [
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([64]),
+            ),
+            (
+                (
+                    torch.tensor([0, 1], dtype=torch.int32),
+                    torch.tensor([2, 3], dtype=torch.int32),
+                ),
+                torch.randn([1]),
+            ),
+        ]
+        total_test_combo.append(
+            list(itertools.product(skip_mutable_buffer, sample_inputs))
+        )
+
+        for i, test_combo in enumerate(total_test_combo):
+            for j, combo in enumerate(test_combo):
+                with self.subTest(f"mode_{i}-{j}"):
+                    module = self.get_qdq_module(
+                        IndexPut(skip_mutable_buffer=combo[0], mode=i),  # noqa: F405
+                        combo[1],
+                    )
+                    self.lower_module_and_test_output(
+                        module,
+                        combo[1],
+                        skip_mutable_buffer=combo[0],
+                    )
+
+    def test_qnn_backend_index_put_suite(self):
+        accumulate = [False, True]
+        in_place = [False, True]
+        sample_inputs = [
+            # basic
+            (
+                torch.rand(5, 2) * 100,
+                (torch.tensor([0, 2]),),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])),
+            # shape
+            (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])),
+            (
+                torch.rand(5, 2),
+                (torch.tensor([0, 2]), torch.tensor([1, 1])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (
+                torch.rand(5, 3, 2),
+                (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            # TODO: not supported by HTP
+            # (
+            #     torch.rand(5, 3, 2, 4),
+            #     (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])),
+            #     torch.tensor([10.0]),
+            # ),
+            # indices
+            (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])),
+            (
+                torch.rand(5, 3),
+                (torch.tensor([0, 2, 4]),),
+                torch.tensor([10.0, 20.0, 30.0]),
+            ),
+            (
+                torch.rand(5),
+                (torch.tensor([1, 1, 3, 3]),),
+                torch.tensor([10.0, 20.0, 30.0, 40.0]),
+            ),
+            # broadcasting
+            (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])),
+            (
+                torch.rand(3, 4),
+                (torch.tensor([0, 1]), torch.tensor([1, 2])),
+                torch.tensor([10.0, 20.0]),
+            ),
+            (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])),
+            (
+                torch.rand(3, 2, 2),
+                (torch.tensor([0, 1]),),
+                torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
+            ),
+            (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])),
+            # two-index
+            (
+                torch.rand(4, 3),
+                (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])),
+                torch.tensor([10.0, 20.0, 30.0]),
+            ),
+            (
+                torch.rand(3, 3),
+                (torch.tensor([0, 2]), torch.tensor([1, 1])),
+                torch.tensor([15.0, 25.0]),
+            ),
+            (
+                torch.rand(3, 2),
+                (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])),
+                torch.tensor([5.0, 10.0, 15.0]),
+            ),
+            (
+                torch.rand(3, 2),
+                (torch.tensor([1]), torch.tensor([0, 0, 1])),
+                torch.tensor([5.0, 10.0, 15.0]),
+            ),
+        ]
+        test_combo = list(itertools.product(accumulate, in_place, sample_inputs))
+        for i, combo in enumerate(test_combo):
             with self.subTest(i=i):
                 module = self.get_qdq_module(
-                    test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
-                )
-                self.lower_module_and_test_output(
-                    module,
-                    test[QCOM_SAMPLE_INPUTS],
-                    skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer,
+                    IndexPutSuite(accumulate=combo[0], in_place=combo[1]),  # noqa: F405
+                    combo[2],
                 )
+                self.lower_module_and_test_output(module, combo[2])
 
     def test_qnn_backend_index_select(self):
         module = IndexSelect(dim=1)  # noqa: F405
@@ -2532,9 +3081,9 @@ def test_qnn_backend_leaky_relu(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(module, sample_input)
-                        index += 1
 
     def test_qnn_backend_less_equal(self):
         test_comb = [
@@ -2666,13 +3215,62 @@ def test_qnn_backend_max_pool2d(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
-    def test_qnn_backend_mean_dim(self):
-        modules = [MeanWKeppDim(), MeanWOKeppDim()]  # noqa: F405
-        sample_input = (torch.randn([2, 5, 1, 3]),)
-        for i, module in enumerate(modules):
+    def test_qnn_backend_mean(self):
+        test_comb = [
+            # Reduce over last two dims, keepdim=True
+            {
+                QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+            },
+            # Reduce over last two dims, keepdim=False
+            {
+                QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),),
+            },
+            # Default: reduce all dims
+            {
+                QCOM_MODULE: Mean(),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),),
+            },
+            # TODO: To be enabled via reshape input to 1d tensor
+            # Scalar case
+            # {
+            #     QCOM_MODULE: Mean(),
+            #     QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),),
+            # },
+            # Edge case: dim is a empty list
+            {
+                QCOM_MODULE: Mean(dim=[]),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along dim=0 (batch dimension)
+            {
+                QCOM_MODULE: Mean(dim=0),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along dim=0 with keepdim=True
+            {
+                QCOM_MODULE: Mean(dim=0, keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),),
+            },
+            # Edge case: reduce along multiple dims
+            {
+                QCOM_MODULE: Mean(dim=(0, 2)),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),),
+            },
+            # Edge case: high-dimensional tensor
+            {
+                QCOM_MODULE: Mean(dim=(1, 3), keepdim=True),  # noqa: F405
+                QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),),
+            },
+        ]
+
+        for i, test in enumerate(test_comb):
             with self.subTest(i=i):
-                module = self.get_qdq_module(module, sample_input)
-                self.lower_module_and_test_output(module, sample_input)
+                module = self.get_qdq_module(
+                    test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS]
+                )
+                self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS])
 
     def test_qnn_backend_mha(self):
         module = MultiheadAttention()  # noqa: F405
@@ -2722,6 +3320,17 @@ def test_qnn_backend_pad(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_permute(self):
+        modules = [
+            Permute([0, 2, 3, 1]),  # noqa: F405
+            Permute([-1, -3, -2, -4]),  # noqa: F405
+        ]
+        sample_input = (torch.randn([2, 3, 4, 5]),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_pixel_shuffle(self):
         module = PixelShuffle(2)  # noqa: F405
         sample_input = (torch.ones([2, 4, 3, 3]),)
@@ -2735,10 +3344,29 @@ def test_qnn_backend_pixel_unshuffle(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_pow_tensor_scalar(self):
-        module = PowTensorScalar()  # noqa: F405
-        sample_input = (torch.rand([2, 4, 3, 3]),)
-        module = self.get_qdq_module(module, sample_input)
-        self.lower_module_and_test_output(module, sample_input)
+        test_comb = [
+            {
+                QCOM_MODULE: [
+                    PowTensorScalar(),  # noqa: F405
+                    PowTensorScalar(1),  # noqa: F405
+                    PowTensorScalar(-1),  # noqa: F405
+                    PowTensorScalar(0.5),  # noqa: F405
+                ],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)],
+            },
+            {
+                QCOM_MODULE: [PowTensorScalar(10)],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)],
+            },
+        ]
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        index += 1
+                        qdq_module = self.get_qdq_module(module, sample_input)
+                        self.lower_module_and_test_output(qdq_module, sample_input)
 
     def test_qnn_backend_prelu(self):
         test_comb = [
@@ -2757,9 +3385,9 @@ def test_qnn_backend_prelu(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(module, sample_input)
-                        index += 1
 
     def test_qnn_backend_relu(self):
         module = Relu()  # noqa: F405
@@ -2897,10 +3525,8 @@ def test_qnn_backend_slice_scatter(self):
                 ],
                 QCOM_SAMPLE_INPUTS: [
                     (
-                        (
-                            torch.zeros(8, 8),
-                            torch.ones(8, 2),
-                        )
+                        torch.zeros(8, 8),
+                        torch.ones(8, 2),
                     )
                 ],
             },
@@ -2911,9 +3537,9 @@ def test_qnn_backend_slice_scatter(self):
             for module in comb[QCOM_MODULE]:
                 for sample_input in comb[QCOM_SAMPLE_INPUTS]:
                     with self.subTest(i=index):
+                        index += 1
                         module = self.get_qdq_module(module, sample_input)
                         self.lower_module_and_test_output(module, sample_input)
-                        index += 1
 
     def test_qnn_backend_softmax(self):
         modules = [Softmax(dim=1), Softmax(dim=-1)]  # noqa: F405
@@ -2963,6 +3589,18 @@ def test_qnn_backend_tanh(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_threshold(self):
+        modules = [
+            Threshold(),  # noqa: F405
+            Threshold(threshold=0.5, value=3.0, inplace=True),  # noqa: F405
+            Threshold(threshold=0.5, value=3.0, inplace=False),  # noqa: F405
+        ]
+        sample_input = (torch.randn(2, 5, 1, 3),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                qdq_module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(qdq_module, sample_input)
+
     def test_qnn_backend_unflatten(self):
         module = Unflatten(dim=1, sizes=(2, 3, 4))  # noqa: F405
         sample_input = (torch.randn([1, 24]),)
@@ -4874,6 +5512,65 @@ def test_qnn_backend_seq_mse(self):
 
 
 class TestExampleLLMScript(TestQNN):
+    def test_static_gemma_2b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--decoder_model",
+            "gemma-2b",
+            "--model_mode",
+            "kv",
+            "--max_seq_len",
+            "1024",
+            "--eval_perplexity",
+            "--tasks",
+            "wikitext",
+            "--limit",
+            "1",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+        if self.host:
+            cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                inference_speed_ref = {"SM8650": 32, "SM8750": 36}
+                self.assertLessEqual(msg["wiki_ppl"], 35)
+                self.assertLessEqual(msg["pte_size"], 2_700_000_000)  # 2.7GB
+                if self.model in inference_speed_ref:
+                    self.assertGreaterEqual(
+                        msg["inference_speed"], inference_speed_ref[self.model]
+                    )
+
     def test_static_gemma3_1b(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
diff --git a/backends/samsung/CMakeLists.txt b/backends/samsung/CMakeLists.txt
index fff3ece5239..6ea020c0970 100644
--- a/backends/samsung/CMakeLists.txt
+++ b/backends/samsung/CMakeLists.txt
@@ -161,7 +161,7 @@ if(${ANDROID})
   install(
     TARGETS enn_backend enn_logging
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
   )
 endif()
 
diff --git a/backends/samsung/_passes/annotate_qparams.py b/backends/samsung/_passes/annotate_qparams.py
new file mode 100644
index 00000000000..663d1fdf5fa
--- /dev/null
+++ b/backends/samsung/_passes/annotate_qparams.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import operator
+from typing import Any, Dict, List, Optional
+
+import torch
+from executorch.backends.samsung.utils.constants import QuantConstants
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import get_buffer
+from torch.export import ExportedProgram
+from torch.fx import GraphModule, Node
+
+
+class AnnotateQparamsPass(ExportPass):
+    """This parse is to add quantize properties to node need to be quantized.
+
+    Annotate Quant params:
+        For src_node->Q->DQ->..., we will add the quant params from Q->DQ node
+         to the src_node
+
+    Annotate Requantize:
+        For src_node->Q->DQ->Q->DQ->..., if the multiple Q->DQ contains
+         different quant params, we will mark the src_node as need requantize,
+         and add Q->DQ after removing all the Q->DQs.
+    """
+
+    propagate_nodes = {
+        exir_ops.edge.aten.view_copy.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.squeeze_copy.default,
+        exir_ops.edge.aten.squeeze_copy.dim,
+        exir_ops.edge.aten.squeeze_copy.dims,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.unsqueeze_copy.default,
+        exir_ops.edge.aten.concat.default,
+        exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.expand_copy.default,
+    }
+
+    def __init__(self, edge_program: ExportedProgram):
+        super().__init__()
+        self.edge_program = edge_program
+
+    def _get_last_dqs(self, node: Node) -> List[Node]:
+        r"""From one Q-DQ node, find the last DQs in the quantization node chain.
+
+
+        need to consider such case:
+                    /--Q-DQ-node1
+            node->Q->DQ--node-node2
+                    \--Q-DQ-node3
+        This is a dfs implemention, so result will keep sorted
+        Args:
+            node (Node): Search DQ from this node.
+
+        Returns:
+            List[Node]: list of DQ node by original sequence
+        """
+
+        def _impl(node: Node, res_list: List[Node]):
+            if (
+                node.target not in QuantConstants.QUANT_OPS_KEY_MAP
+                and node.target not in QuantConstants.DEQUANT_OPS_KEY_MAP
+            ):
+                return
+            for user in node.users.keys():
+                if (
+                    user.target not in QuantConstants.QUANT_OPS_KEY_MAP
+                    and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP
+                ):
+                    res_list.append(node)
+                else:
+                    _impl(user, res_list)
+
+        res_list: List[Node] = []
+        for user in node.users:
+            _impl(user, res_list)
+        return res_list
+
+    def _propagate_quant_params(self, node: Node):
+        assert (
+            quantize_attrs := node.meta.get("quantize_attrs")
+        ), "Must be annotated node."
+        requantize_map: Dict[Node, Node] = node.meta.get("requantize", {})
+        while node.users:
+            if len(node.users) != 1:
+                break
+            user = list(node.users.keys())[0]
+            if (
+                user.target not in QuantConstants.QUANT_OPS_KEY_MAP
+                and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP
+            ):
+                break
+            node = user
+        # Case1: ...-q-dq(cur)-propagate_node-node(not d-dq)
+        # Case2: propagate_node(propagateed)-propagate_node-node(not q-dq)
+        for idx, user in enumerate(node.users.keys()):
+            # For the branch who need to be requantized, we propagate the requantize params
+            user_attrs = requantize_map.get(idx, quantize_attrs)
+            if user.target not in self.propagate_nodes:
+                continue
+            if len(user.users) == 1:
+                # Possibily no need for checking len(users)>1
+                user_of_user = list(user.users)[0]
+                # node-q-dq-propagate-q-dq not need for propagatey
+                if (
+                    user_of_user.target in QuantConstants.QUANT_OPS_KEY_MAP
+                    or user_of_user.target in QuantConstants.DEQUANT_OPS_KEY_MAP
+                ):
+                    continue
+            # propagate quant for node-q-dq-propagate_node-node(not qdq)
+            user.meta["quantize_attrs"] = user_attrs
+            self._propagate_quant_params(user)
+
+    def _annotate_requantize(self, node: Node):
+        assert (
+            ori_quant_attrs := node.meta.get("quantize_attrs")
+        ), "No quant parameters found"
+        list_for_requantize = self._get_last_dqs(node)
+        node.meta["requantize"] = node.meta.get("requantize", {})
+
+        # We use index to mark the output to be requantized
+        # Because user obj and name may change when we requantize them.
+
+        def _check_same(requant_obj, ori_obj) -> bool:
+            if type(requant_obj) != type(ori_obj):  # noqa E721
+                # We need actually same type here.
+                return False
+            if not isinstance(requant_obj, torch.Tensor):
+                return requant_obj == ori_obj
+            if requant_obj.shape != ori_obj.shape:
+                return False
+            return bool((requant_obj == ori_obj).all())
+
+        requantize_map: Dict[int, Dict] = node.meta["requantize"]
+        for idx, dq in enumerate(list_for_requantize):
+            q = dq.all_input_nodes[0]
+            if q.target not in QuantConstants.QUANT_OPS_KEY_MAP:
+                continue
+            key_map = QuantConstants.DEQUANT_OPS_KEY_MAP[dq.target]
+            requantize_attrs = self.get_quant_attrs(q, key_map)
+            if not all(
+                _check_same(ori_quant_attrs[key], requantize_attrs[key])
+                for key in key_map.values()
+            ):
+                requantize_map[idx] = requantize_attrs
+
+    def _annotate(self, graph_module: GraphModule):
+        for node in graph_module.graph.nodes:
+            key_map = QuantConstants.QUANT_OPS_KEY_MAP.get(node.target, None)
+            if not key_map:
+                continue
+            source_node = node.args[0]
+            if source_node.target in (
+                *QuantConstants.QUANT_OPS_KEY_MAP,
+                *QuantConstants.DEQUANT_OPS_KEY_MAP,
+            ):
+                # Currently, don't add quant info for d_qd node here.
+                continue
+            elif source_node.target == operator.getitem:
+                source_node = source_node.args[0]
+            quant_attrs = self.get_quant_attrs(node, key_map)
+            source_node.meta["quantize_attrs"] = quant_attrs
+            self._annotate_requantize(source_node)
+            self._propagate_quant_params(source_node)
+
+    def call(self, graph_module: GraphModule):
+        self._annotate(graph_module)
+        graph_module.recompile()
+        return PassResult(graph_module, True)
+
+    def get_quant_attrs(
+        self, quant_node: torch.fx.Node, key_map: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        quant_attr_keys = [arg.name for arg in quant_node.target._schema.arguments]
+        quant_attrs = dict.fromkeys(quant_attr_keys)
+        for key, attr in zip(quant_attr_keys[1:], quant_node.args[1:]):
+            # For channel-wise quantization, params are stored by buffer nodes.
+            if isinstance(attr, torch.fx.Node):
+                attr = get_buffer(self.edge_program, attr)
+            quant_attrs[key] = attr
+        quant_attrs["target"] = quant_node.target
+        if key_map is None:
+            return quant_attrs
+        miss_attrs = []
+        for aten_attr, snc_attr in key_map.items():
+            if aten_attr not in quant_attrs:
+                miss_attrs.append(aten_attr)
+                continue
+            attr = quant_attrs[aten_attr]
+            quant_attrs.pop(aten_attr)
+            quant_attrs[snc_attr] = attr
+        assert (
+            not miss_attrs
+        ), f"Miss quant attrs {miss_attrs} for node {quant_node.name}"
+        return quant_attrs
diff --git a/backends/samsung/_passes/annotate_scalar_parameters.py b/backends/samsung/_passes/annotate_scalar_parameters.py
new file mode 100644
index 00000000000..643685bdb25
--- /dev/null
+++ b/backends/samsung/_passes/annotate_scalar_parameters.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.samsung.quantizer.quantizer import global_quant_info
+from executorch.backends.samsung.utils.constants import QuantConstants
+from executorch.backends.transforms.utils import get_param_tensor, is_param_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.export import ExportedProgram
+
+
+class AnnotateScalarParametersPass(ExportPass):
+    """
+    Need to add quantization parameters for scalars for some ops
+    Ifm(Quantized)------TargetOP---
+    Scalar(Non-Quant)---/
+    Notice: Such scalars are converted to tensor node by default pass
+    """
+
+    TARGET_OPS = {
+        exir_ops.edge.aten.mul.Tensor,
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.div.Tensor,
+    }
+
+    def __init__(self, edge_program: ExportedProgram):
+        super().__init__()
+        self.edge_program = edge_program
+
+    def annotate(self, graph_module: torch.fx.GraphModule):
+        for node in graph_module.graph.nodes:
+            if node.target not in self.TARGET_OPS or "quantize_attrs" not in node.meta:
+                continue
+            torch_quant_dtype = global_quant_info.weight_precison.torch_dtype
+            for input_arg in node.all_input_nodes:
+                if input_arg.op not in ("placeholder", "get_attr") or not is_param_node(
+                    self.edge_program, input_arg
+                ):
+                    continue
+                else:
+                    tensor = get_param_tensor(self.edge_program, input_arg)
+                    if not tensor.shape:
+                        qparams = {
+                            QuantConstants.QUANT_KEY.scale: float(tensor),
+                            QuantConstants.QUANT_KEY.quant_dtype: torch_quant_dtype,
+                            QuantConstants.QUANT_KEY.quant_max: torch.iinfo(
+                                torch_quant_dtype
+                            ).max,
+                            QuantConstants.QUANT_KEY.quant_min: torch.iinfo(
+                                torch_quant_dtype
+                            ).min,
+                            QuantConstants.QUANT_KEY.zero_point: 0,
+                        }
+                        input_arg.meta["quantize_attrs"] = qparams
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        self.annotate(graph_module)
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/conv1d_to_conv2d.py b/backends/samsung/_passes/conv1d_to_conv2d.py
index 57f1074b348..1b8782d956b 100644
--- a/backends/samsung/_passes/conv1d_to_conv2d.py
+++ b/backends/samsung/_passes/conv1d_to_conv2d.py
@@ -5,84 +5,93 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+from executorch.backends.transforms.utils import get_param_tensor
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch._export.utils import get_param
 
 
 class Conv1dToConv2d(ExportPass):
-
     def __init__(self, edge_program: ExportedProgram):
         super().__init__()
         self.edge_program = edge_program
 
+    def update_kernel(self, weight_node: torch.Tensor):
+        # lifted tensor in tensor constant
+        weight_3d = get_param_tensor(self.edge_program, weight_node)
+        if param_name := self.edge_program.graph_signature.inputs_to_parameters.get(
+            weight_node.name
+        ):
+            new_weight_param = torch.nn.Parameter(
+                data=weight_3d.data.contiguous().unsqueeze(dim=-1), requires_grad=False
+            )
+            self.edge_program.state_dict[param_name] = new_weight_param
+        elif tensor_name := self.edge_program.graph_signature.inputs_to_lifted_tensor_constants.get(
+            weight_node.name
+        ):
+            self.edge_program.constants[tensor_name] = torch.unsqueeze(weight_3d, -1)
+        else:
+            RuntimeError("Weight of 1d conv should be constant tensor or Parameter obj")
+        weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze(dim=-1)
+
     def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
         node_list = list(graph.nodes)
         for node in node_list:
-            if node.op == "call_function":
-                if node.target == exir_ops.edge.aten.convolution.default:
-                    stride = list(node.args[3])
-                    if len(stride) != 1:
-                        continue
+            if node.op != "call_function":
+                continue
+            if node.target != exir_ops.edge.aten.convolution.default:
+                continue
+            stride = list(node.args[3])
+            if len(stride) != 1:
+                continue
 
-                    # convert 3dim weight to 4dim
-                    weight_node = node.args[1]
-                    weight_3dim = get_param(self.edge_program, weight_node)
-                    weight_4dim = torch.nn.Parameter(
-                        data=weight_3dim.data.contiguous().unsqueeze(dim=-1),
-                        requires_grad=False,
-                    )
-                    parameter_name = (
-                        self.edge_program.graph_signature.inputs_to_parameters[
-                            weight_node.name
-                        ]
-                    )
-                    self.edge_program.state_dict[parameter_name] = weight_4dim
-                    weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze(
-                        dim=-1
-                    )
+            # convert 3dim weight to 4dim
+            weight_node = node.args[1]
+            self.update_kernel(weight_node)
 
-                    # Extend stride, padding, and dilation
-                    node.args = (
-                        node.args[0],
-                        node.args[1],
-                        node.args[2],
-                        node.args[3] + [1],  # stride
-                        node.args[4] + [0],  # padding
-                        node.args[5] + [1],  # dilation
-                        node.args[6],
-                        node.args[7],
-                        node.args[8],
-                    )
+            # Extend stride, padding, and dilation
+            node.args = (
+                node.args[0],
+                node.args[1],
+                node.args[2],
+                node.args[3] + [1],  # stride
+                node.args[4] + [0],  # padding
+                node.args[5] + [1],  # dilation
+                node.args[6],
+                node.args[7],
+                node.args[8],
+            )
+            # unsqueeze -> conv2d -> squeeze
 
-                    # unsqueeze -> conv2d -> squeeze
-                    with graph.inserting_before(node):
-                        input_node = node.args[0]
-                        unsqueeze_before = graph.create_node(
-                            "call_function", exir_ops.edge.aten.unsqueeze_copy.default
-                        )
-                        unsqueeze_before.args = (
-                            input_node,
-                            -1,
-                        )
-                        node.replace_input_with(input_node, unsqueeze_before)
+            with graph.inserting_before(node):
+                input_node = node.args[0]
+                prev_qparams = input_node.meta.get("quantize_attrs")
+                unsqueeze_before = graph.create_node(
+                    "call_function", exir_ops.edge.aten.unsqueeze_copy.default
+                )
+                unsqueeze_before.args = (
+                    input_node,
+                    -1,
+                )
+                node.replace_input_with(input_node, unsqueeze_before)
 
-                    with graph.inserting_after(node):
-                        squeeze_after = graph.create_node(
-                            "call_function", exir_ops.edge.aten.squeeze_copy.dims
-                        )
-                        squeeze_after.args = (
-                            node,
-                            [-1],
-                        )
-                        original_users = [
-                            user for user in node.users if user != squeeze_after
-                        ]
-                        for user in original_users:
-                            user.replace_input_with(node, squeeze_after)
+            with graph.inserting_after(node):
+                squeeze_after = graph.create_node(
+                    "call_function", exir_ops.edge.aten.squeeze_copy.dims
+                )
+                squeeze_after.args = (
+                    node,
+                    [-1],
+                )
+                original_users = [user for user in node.users if user != squeeze_after]
+                for user in original_users:
+                    user.replace_input_with(node, squeeze_after)
+            if quant_attr := node.meta.get("quantize_attrs"):
+                squeeze_after.meta["quantize_attrs"] = quant_attr
+            if prev_qparams is not None:
+                unsqueeze_before.meta["quantize_attrs"] = prev_qparams
 
         graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        _ = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/fold_qdq.py b/backends/samsung/_passes/fold_qdq.py
new file mode 100644
index 00000000000..c6f3699ece7
--- /dev/null
+++ b/backends/samsung/_passes/fold_qdq.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.samsung.utils.constants import QuantConstants
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+from torch.fx import GraphModule
+
+
+class FoldQDQPass(ExportPass):
+    def __init__(self):
+        super().__init__()
+
+    def _fold(
+        self,
+        graph_module: GraphModule,
+    ):
+        for node in graph_module.graph.nodes:
+            if node.target not in (
+                *QuantConstants.QUANT_OPS_KEY_MAP.keys(),
+                *QuantConstants.DEQUANT_OPS_KEY_MAP.keys(),
+            ):
+                continue
+            for user in [user for user in node.users.keys()]:  # noqa: C416
+                user.replace_input_with(node, node.args[0])
+            graph_module.graph.erase_node(node)
+
+    def call(self, graph_module: GraphModule):
+        self._fold(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
+        _ = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/fuse_conv_act.py b/backends/samsung/_passes/fuse_conv_act.py
new file mode 100644
index 00000000000..c034c98bb14
--- /dev/null
+++ b/backends/samsung/_passes/fuse_conv_act.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+from torch.fx import GraphModule
+
+
+def map_hardtan_relux(tanhnode: torch.fx.node.Node) -> Optional[str]:
+    assert (
+        tanhnode.target == exir_ops.edge.aten.hardtanh.default
+    ), "Must be a hardtanh node"
+    if not tanhnode.args[1] == 0.0:
+        return None
+    if tanhnode.args[2] == 6.0:
+        return "RELU6"
+    return None
+
+
+class FuseConvActPass(ExportPass):
+    TARGET_ACTS_MAP = {
+        exir_ops.edge.aten.relu.default: (lambda x: "RELU"),
+        exir_ops.edge.aten.relu_.default: (lambda x: "RELU"),
+        exir_ops.edge.aten.relu6.default: (lambda x: "RELU6"),
+        exir_ops.edge.aten.relu6_.default: (lambda x: "RELU6"),
+        exir_ops.edge.aten.hardtanh.default: map_hardtan_relux,
+        exir_ops.edge.aten.hardtanh_.default: map_hardtan_relux,
+    }
+
+    def _fuse(
+        self,
+        graph_module: GraphModule,
+    ):
+        for target_conv, target_act in self.get_target_conv_act(graph_module):
+            assert (
+                act_name := self.TARGET_ACTS_MAP.get(target_act.target)(target_act)
+            ), f"Not supported {target_act.name} now."
+            target_conv.meta["activation"] = act_name
+            if "quantize_attrs" in target_act.meta:
+                target_conv.meta["quantize_attrs"] = target_act.meta["quantize_attrs"]
+
+            # If we merge the real out activation to conv, the conv should be the real out
+            if "real_out" in target_act.meta:
+                target_conv.meta["real_out"] = target_act.meta["real_out"]
+            for user in [user for user in target_act.users.keys()]:  # noqa: C416
+                user.replace_input_with(target_act, target_conv)
+            graph_module.graph.erase_node(target_act)
+
+    def get_target_conv_act(self, graph_module: GraphModule):
+        for node in graph_module.graph.nodes:
+            if node.target != exir_ops.edge.aten.convolution.default:
+                continue
+            if len(node.users) != 1:
+                # Such cases couldn't be conv + act
+                continue
+            act_node = list(node.users.keys())[0]
+            if act_node.target not in self.TARGET_ACTS_MAP:
+                continue
+            if "quantize_attrs" in node.meta:
+                # If the conv's output is quantized
+                # We do not fuse them
+                continue
+            yield node, act_node
+
+    def call(self, graph_module: GraphModule):
+        self._fuse(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
+        _ = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/insert_qdq.py b/backends/samsung/_passes/insert_qdq.py
new file mode 100644
index 00000000000..a59b011ac4b
--- /dev/null
+++ b/backends/samsung/_passes/insert_qdq.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+from typing import Any, Dict
+
+import torch
+from executorch.backends.samsung._passes.utils import none_quant_tensor_quant_meta
+from executorch.backends.samsung.utils.constants import QuantConstants
+from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.export import ExportedProgram
+from torch.fx import GraphModule
+
+
+class QType(Enum):
+    Quant = 0
+    Dequant = 1
+
+
+class InsertQDQPass(ExportPass):
+    QDQ_MAP = {
+        # per tensor
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+        # per channel
+        exir_ops.edge.quantized_decomposed.quantize_per_channel.default: exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+    }
+
+    def __init__(self, edge_program: ExportedProgram):
+        super().__init__()
+        self.edge_program = edge_program
+
+    def _create_qdq_node(
+        self,
+        graph_module: GraphModule,
+        qtype: QType,
+        input_node: torch.fx.Node,
+        quant_attrs: Dict[str, Any],
+    ) -> torch.fx.Node:
+        assert (target := quant_attrs.get("target")), ""
+        new_node_args = [input_node]
+        new_node_meta_val = input_node.meta["val"]
+        new_node_quant_attrs = {}
+        if qtype == QType.Dequant:
+            target = self.QDQ_MAP[target]
+        else:
+            # For input node, we should set the val type as quant type
+            key = QuantConstants.QUANT_KEY.quant_dtype
+            new_node_meta_val = new_node_meta_val.to(quant_attrs[key])
+            new_node_quant_attrs.update(quant_attrs)
+
+        for arg in target._schema.arguments[1:]:
+            name = arg.name
+            if name == "out_dtype":
+                continue
+            if qtype == QType.Quant:
+                key = QuantConstants.QUANT_OPS_KEY_MAP[target].get(name, name)
+            else:
+                key = QuantConstants.DEQUANT_OPS_KEY_MAP[target].get(name, name)
+            arg_value = quant_attrs[key]
+            if isinstance(arg.type, torch.Tensor) and (
+                isinstance(arg_value, int) or isinstance(arg_value, float)
+            ):
+                arg_value = torch.Tensor(arg_value)
+            new_node_args.append(arg_value)
+
+        new_node = graph_module.graph.create_node(
+            "call_function", target, tuple(new_node_args)
+        )
+        if new_node_quant_attrs:
+            new_node.meta["quantize_attrs"] = new_node_quant_attrs
+        else:
+            new_node.meta["quantize_attrs"] = {
+                QuantConstants.QUANT_KEY.quant_dtype: torch.float32,
+                QuantConstants.QUANT_KEY.scale: [1.0],
+                QuantConstants.QUANT_KEY.zero_point: [0],
+            }
+        new_node.meta["val"] = new_node_meta_val
+        return new_node
+
+    def _add_dq_after(self, graph_module: GraphModule, node: torch.fx.Node):
+        if not (quant_attrs := node.meta.get("quantize_attrs")):
+            return
+        with graph_module.graph.inserting_after(node):
+            new_node = self._create_qdq_node(
+                graph_module, QType.Dequant, node, quant_attrs
+            )
+            users = [user for user in node.users.keys() if (user.op == "output")]
+            for user in users:
+                user.replace_input_with(node, new_node)
+
+    def _add_q_after(self, graph_module: GraphModule, node: torch.fx.Node):
+        # In node don't need quant attrs after insert new quantize node.
+        if not (quant_attrs := node.meta.pop("quantize_attrs", None)):
+            return
+        node.meta["quantize_attrs"] = none_quant_tensor_quant_meta()
+        with graph_module.graph.inserting_after(node):
+            users = list(node.users.keys())
+            new_node = self._create_qdq_node(
+                graph_module, QType.Quant, node, quant_attrs
+            )
+            for user in users:
+                if user.target not in QuantConstants.QUANT_OPS_KEY_MAP:
+                    user.replace_input_with(node, new_node)
+
+    def _add_q_before(
+        self,
+        graph_module: GraphModule,
+        node: torch.fx.Node,
+        from_node: torch.fx.Node,
+        quantize_attrs: Dict,
+    ):
+        with graph_module.graph.inserting_before(node):
+            new_quant_node = self._create_qdq_node(
+                graph_module, QType.Quant, from_node, quantize_attrs
+            )
+            node.replace_input_with(from_node, new_quant_node)
+        return new_quant_node
+
+    def _add_dq_before(
+        self,
+        graph_module: GraphModule,
+        node: torch.fx.Node,
+        from_node: torch.fx.Node,
+        quantize_attrs: Dict,
+    ):
+        with graph_module.graph.inserting_before(node):
+            new_dequant_node = self._create_qdq_node(
+                graph_module, QType.Dequant, from_node, quantize_attrs
+            )
+            node.replace_input_with(from_node, new_dequant_node)
+        return new_dequant_node
+
+    def _add_qdq_for_requantize(self, graph_module: GraphModule):
+        for node in graph_module.graph.nodes:
+            requant_map: Dict[int, Dict] = node.meta.get("requantize")
+            if requant_map is None:
+                continue
+            assert (ori_quant_attrs := node.meta.get("quantize_attrs"))
+            usr_list = list(node.users.keys())
+            for user_idx, requant_params in requant_map.items():
+                user = usr_list[user_idx]
+                q_node = self._add_q_before(graph_module, user, node, requant_params)
+                _ = self._add_dq_before(graph_module, q_node, node, ori_quant_attrs)
+
+    def _add_qdq(self, graph_module: GraphModule):
+        for node in list(graph_module.graph.nodes):
+            if is_graph_input(self.edge_program, node):
+                self._add_q_after(graph_module, node)
+            elif is_graph_output(node):
+                self._add_dq_after(graph_module, node)
+
+    def call(self, graph_module: GraphModule):
+        self._add_qdq(graph_module)
+        self._add_qdq_for_requantize(graph_module)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/remove_useless_ops.py b/backends/samsung/_passes/remove_useless_ops.py
new file mode 100644
index 00000000000..c88a2d4a5d8
--- /dev/null
+++ b/backends/samsung/_passes/remove_useless_ops.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+from torch.fx import GraphModule
+
+
+class RemoveUselessOpPass(ExportPass):
+    # such ops should be single-in and single-out
+    USELESS_OP_SET = {
+        exir_ops.edge.aten._to_copy.default,
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.alias.default,
+        exir_ops.edge.aten.lift_fresh_copy.default,
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def gen_pattern_as_strided_copy(self, graph_module: GraphModule):
+        for node in list(graph_module.graph.nodes):  # noqa: C416
+            if node.target != exir_ops.edge.aten.mean.dim:
+                continue
+            if len(node.users) != 1:
+                continue
+            successor = list(node.users.keys())[0]
+            if successor.target != exir_ops.edge.aten.as_strided_copy.default:
+                continue
+            is_pattern = True
+            count = 0
+            for i, stride in enumerate(successor.args[2]):
+                if stride < node.meta["val"].size()[i]:
+                    if stride == 1:
+                        count += 1
+                    else:
+                        is_pattern = False
+                        break
+                if count >= 2:
+                    is_pattern = False
+                    break
+            if is_pattern:
+                yield successor
+
+    def _fold_as_strided_copy(
+        self,
+        graph_module: GraphModule,
+    ):
+        for as_strided_copy_node in self.gen_pattern_as_strided_copy(graph_module):
+            for user in list(as_strided_copy_node.users.keys()):
+                user.replace_input_with(
+                    as_strided_copy_node, as_strided_copy_node.args[0]
+                )
+            graph_module.graph.erase_node(as_strided_copy_node)
+
+    def _remove_useless(
+        self,
+        graph_module: GraphModule,
+    ):
+        for node in graph_module.graph.nodes:
+            if node.target not in self.USELESS_OP_SET:
+                continue
+
+            # Prevent from removing if data type may change.
+            if (
+                node.target == exir_ops.edge.aten._to_copy.default
+                or node.target == exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+            ) and "memory_format" not in node.kwargs:
+                continue
+
+            for user in [user for user in node.users.keys()]:  # noqa: C416
+                user.replace_input_with(node, node.all_input_nodes[0])
+            graph_module.graph.erase_node(node)
+        self._fold_as_strided_copy(graph_module)
+
+    def call(self, graph_module: GraphModule):
+        self._remove_useless(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
+        _ = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
diff --git a/backends/samsung/_passes/utils.py b/backends/samsung/_passes/utils.py
new file mode 100644
index 00000000000..afa7c72c601
--- /dev/null
+++ b/backends/samsung/_passes/utils.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def none_quant_tensor_quant_meta():
+    return {
+        "quant_dtype": torch.float32,
+        "scales": 1,
+        "zero_points": 0,
+    }
diff --git a/backends/samsung/build.sh b/backends/samsung/build.sh
index dfa6407ff50..4845c760f0c 100755
--- a/backends/samsung/build.sh
+++ b/backends/samsung/build.sh
@@ -45,6 +45,7 @@ function build_x86_64() {
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
 	      -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -S ${PROJECT_DIR} \
         -B ${X86_64_BUILD_DIR}
@@ -77,6 +78,7 @@ function build_android() {
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
 	      -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
         -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
         -DEXECUTORCH_ENABLE_LOGGING=1 \
         -DEXECUTORCH_BUILD_DEVTOOLS=ON \
diff --git a/backends/samsung/builders/__init__.py b/backends/samsung/builders/__init__.py
index 02a457fd06e..978da82b370 100644
--- a/backends/samsung/builders/__init__.py
+++ b/backends/samsung/builders/__init__.py
@@ -14,11 +14,13 @@
     op_clamp,
     op_constant_pad_nd,
     op_conv2d,
+    op_dequantize,
     op_div,
     op_embedding,
     op_expand_copy,
     op_gelu,
     op_getitem,
+    op_hardsigmoid,
     op_hardswish,
     op_hardtanh,
     op_layer_norm,
@@ -32,6 +34,7 @@
     op_mul,
     op_permute,
     op_pixel_shuffle,
+    op_quantize,
     op_relu,
     op_reshape,
     op_rsqrt,
@@ -57,6 +60,7 @@
     op_clamp,
     op_conv2d,
     op_constant_pad_nd,
+    op_dequantize,
     op_div,
     op_embedding,
     op_expand_copy,
@@ -64,6 +68,7 @@
     op_getitem,
     op_hardswish,
     op_hardtanh,
+    op_hardsigmoid,
     op_layer_norm,
     op_leaky_relu,
     op_linear,
@@ -75,6 +80,7 @@
     op_mul,
     op_permute,
     op_pixel_shuffle,
+    op_quantize,
     op_relu,
     op_reshape,
     op_rsqrt,
diff --git a/backends/samsung/builders/node_visitor.py b/backends/samsung/builders/node_visitor.py
index a35c0b4715d..0d2707da8f5 100644
--- a/backends/samsung/builders/node_visitor.py
+++ b/backends/samsung/builders/node_visitor.py
@@ -14,6 +14,7 @@
     get_tensor_type,
 )
 from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph
+from executorch.backends.samsung.utils.constants import QuantConstants
 from executorch.backends.transforms.utils import is_param_node
 from torch.export import ExportedProgram
 
@@ -61,18 +62,26 @@ def define_tensor(
 
         dims = [1] if len(tensor.size()) == 0 else list(tensor.size())
 
+        quant_attrs = node.meta.get("quantize_attrs")
         enn_tensor_id = enn_graph.define_tensor(
             node.name,
             dims,
             data_type,
             tensor_type.name,
             const_data,
+            quant_param=quant_attrs,
         )
         assert enn_tensor_id is not None
         vals_to_ids[node] = enn_tensor_id
 
         return enn_tensor_id
 
+    def _update_params_qdtype(self, node: torch.fx.Node, params: Dict):
+        if qdtype := node.meta.get("quantize_attrs", {}).get(
+            QuantConstants.QUANT_KEY.quant_dtype
+        ):
+            params["quant_dtype"] = EnnGraph._affine_meta_param(qdtype)
+
 
 _node_visitor_dict = {}
 
@@ -92,6 +101,7 @@ def register_node_visitor(visitor):
         raise TypeError(
             f"target of vistor should be str|Tuple[str]|List[str], not{type(visitor.target)}"
         )
+    return visitor
 
 
 def get_node_visitors(*args) -> Dict[str, NodeVisitor]:
diff --git a/backends/samsung/builders/op_add.py b/backends/samsung/builders/op_add.py
index 1b0dddb0d02..a6eb79897dd 100644
--- a/backends/samsung/builders/op_add.py
+++ b/backends/samsung/builders/op_add.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import Dict
 
 import torch
@@ -28,9 +29,13 @@ def define_node(
     ) -> None:
         input1 = node.args[0]
         input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids)
+        params = {}
+        self._update_params_qdtype(node, params)
         input2 = node.args[1]
         input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "ELTSUM", [input_id_1, input_id_2], [output_id])
+        enn_graph.define_op(
+            node.name, "ELTSUM", [input_id_1, input_id_2], [output_id], params
+        )
diff --git a/backends/samsung/builders/op_avg_pool2d.py b/backends/samsung/builders/op_avg_pool2d.py
index ad7ccbac3ae..bfca8b89b22 100644
--- a/backends/samsung/builders/op_avg_pool2d.py
+++ b/backends/samsung/builders/op_avg_pool2d.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict, List
 
 import torch
@@ -49,6 +50,7 @@ def define_node(
         params["stride_w"] = stride[1]
         params["padding"] = "EXPLICIT"
         params["explicit_padding"] = explicit_padding
+        self._update_params_qdtype(node, params)
 
         if len(node.args) > 4:
             ceil_mode = cast(bool, node.args[4])
@@ -64,7 +66,5 @@ def define_node(
             assert (
                 divisor_override == kernel_size[0] * kernel_size[1]
             ), "Not supported divisor_override which is not equal to pooling region."
-
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
-
         enn_graph.define_op(node.name, "AVGPOOL2D", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_bmm.py b/backends/samsung/builders/op_bmm.py
index 6ba8864ebb3..13e0d19cb14 100644
--- a/backends/samsung/builders/op_bmm.py
+++ b/backends/samsung/builders/op_bmm.py
@@ -16,7 +16,7 @@
 
 @register_node_visitor
 class BMMVisitor(NodeVisitor):
-    target = "aten.bmm.default"
+    target = ["aten.bmm.default"]
 
     def __init__(self, *args) -> None:
         super().__init__(*args)
@@ -29,12 +29,15 @@ def define_node(
     ) -> None:
         input1 = node.args[0]
         input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids)
+
         input2 = node.args[1]
         input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids)
 
         # output
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
+        params = {}
+        self._update_params_qdtype(node, params)
         enn_graph.define_op(
-            node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id]
+            node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id], params
         )
diff --git a/backends/samsung/builders/op_cat.py b/backends/samsung/builders/op_cat.py
index e9c0a32b389..09387f2e361 100644
--- a/backends/samsung/builders/op_cat.py
+++ b/backends/samsung/builders/op_cat.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict, List
 
 import torch
@@ -12,6 +13,7 @@
 )
 from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph
 from executorch.backends.transforms import get_shape
+from executorch.backends.transforms.utils import is_param_node
 
 
 @register_node_visitor
@@ -29,14 +31,20 @@ def define_node(
     ) -> None:
         tensors = cast(List[torch.fx.Node], node.args[0])
         input_tensor_ids = []
-
-        for in_tensor in tensors:
+        constant_idx = None
+        for idx, in_tensor in enumerate(tensors):
+            if is_param_node(self.exported_program, in_tensor):
+                assert constant_idx is None, "Only support at most 1 constant tensor"
+                constant_idx = idx
             input_id = self.define_tensor(in_tensor, enn_graph, vals_to_ids)
             input_tensor_ids.append(input_id)
 
         in_shape = get_shape(node)
         axis = cast(int, node.args[1]) % len(in_shape) if len(node.args) >= 2 else 0
         params = {"axis": axis}
+        if constant_idx is not None:
+            params["constant_index"] = constant_idx
+        self._update_params_qdtype(node, params)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
         enn_graph.define_op(node.name, "CONCAT", input_tensor_ids, [output_id], params)
diff --git a/backends/samsung/builders/op_clamp.py b/backends/samsung/builders/op_clamp.py
index c5670b80fa3..74af83212a5 100644
--- a/backends/samsung/builders/op_clamp.py
+++ b/backends/samsung/builders/op_clamp.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict
 
 import torch
@@ -32,12 +33,15 @@ def define_node(
         # The default value of lower bound and upper bound
         output_min = torch.finfo(torch.float32).min
         output_max = torch.finfo(torch.float32).max
+
         if node.args[1] is not None:
             output_min = cast(float, node.args[1])
         if len(node.args) > 2 and node.args[2] is not None:
             output_max = cast(float, node.args[2])
 
         params = {"minimum": output_min, "maximum": output_max}
+        self._update_params_qdtype(node, params)
+
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
         enn_graph.define_op(node.name, "CLIP", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_conv2d.py b/backends/samsung/builders/op_conv2d.py
index 881a533801f..ab77d8df626 100644
--- a/backends/samsung/builders/op_conv2d.py
+++ b/backends/samsung/builders/op_conv2d.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict, List
 
 import torch
@@ -56,6 +57,9 @@ def define_node(
         input_shape = get_shape(input)
         kernel_shape = get_shape(weight_node)
         params = {}
+        self._update_params_qdtype(node, params)
+        if "activation" in node.meta:
+            params["activation"] = node.meta["activation"]
         params["kernel_h"] = kernel_shape[2]
         params["kernel_w"] = kernel_shape[3]
         params["stride_h"] = stride[0]
diff --git a/backends/samsung/builders/op_dequantize.py b/backends/samsung/builders/op_dequantize.py
new file mode 100644
index 00000000000..a1c31af4037
--- /dev/null
+++ b/backends/samsung/builders/op_dequantize.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.samsung.builders.node_visitor import register_node_visitor
+from executorch.backends.samsung.builders.op_quantize import _QuantOpVistorBase
+
+
+# Dequant ops here
+@register_node_visitor
+class DequantizeVistor(_QuantOpVistorBase):
+    target = [
+        "quantized_decomposed.dequantize_per_tensor.default",
+        "quantized_decomposed.dequantize_per_tensor.tensor",
+        "quantized_decomposed.dequantize_per_channel.default",
+        "quantized_decomposed.dequantize_per_channel.tensor",
+    ]
diff --git a/backends/samsung/builders/op_div.py b/backends/samsung/builders/op_div.py
index 89d773ddb0e..8b0e7cdd5af 100644
--- a/backends/samsung/builders/op_div.py
+++ b/backends/samsung/builders/op_div.py
@@ -27,13 +27,16 @@ def define_node(
         enn_graph: EnnGraph,
         vals_to_ids: Dict[torch.Tensor, int],
     ) -> None:
-        # inputs
         input1 = node.args[0]
         input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids)
+
         input2 = node.args[1]
         input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids)
-
+        params = {}
+        self._update_params_qdtype(node, params)
         # output
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "ELTDIV", [input_id_1, input_id_2], [output_id])
+        enn_graph.define_op(
+            node.name, "ELTDIV", [input_id_1, input_id_2], [output_id], params
+        )
diff --git a/backends/samsung/builders/op_gelu.py b/backends/samsung/builders/op_gelu.py
index 059a3b77850..88417f688f9 100644
--- a/backends/samsung/builders/op_gelu.py
+++ b/backends/samsung/builders/op_gelu.py
@@ -27,8 +27,14 @@ def define_node(
         enn_graph: EnnGraph,
         vals_to_ids: Dict[torch.Tensor, int],
     ) -> None:
-        input_id = self.define_tensor(node.args[0], enn_graph, vals_to_ids)
+        # input1
+        input = node.args[0]
+        input_id = self.define_tensor(input, enn_graph, vals_to_ids)
 
+        # output
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "GELU", [input_id], [output_id])
+        params = {}
+        self._update_params_qdtype(node, params)
+
+        enn_graph.define_op(node.name, "GELU", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_hardsigmoid.py b/backends/samsung/builders/op_hardsigmoid.py
new file mode 100644
index 00000000000..3a50d65da41
--- /dev/null
+++ b/backends/samsung/builders/op_hardsigmoid.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.samsung.builders.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph
+
+
+@register_node_visitor
+class HardSigmoidVisitor(NodeVisitor):
+    target = "aten.hardsigmoid.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        enn_graph: EnnGraph,
+        vals_to_ids: Dict[torch.Tensor, int],
+    ) -> None:
+        input = node.args[0]
+        input_id = self.define_tensor(input, enn_graph, vals_to_ids)
+        output_id = self.define_tensor(node, enn_graph, vals_to_ids)
+        params = {}
+        self._update_params_qdtype(node, params)
+        enn_graph.define_op(node.name, "HardSigmoid", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_hardswish.py b/backends/samsung/builders/op_hardswish.py
index 72a99d17b83..8c30125e8a4 100644
--- a/backends/samsung/builders/op_hardswish.py
+++ b/backends/samsung/builders/op_hardswish.py
@@ -29,7 +29,7 @@ def define_node(
     ) -> None:
         input = node.args[0]
         input_id = self.define_tensor(input, enn_graph, vals_to_ids)
-
+        params = {}
+        self._update_params_qdtype(node, params)
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
-
-        enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id])
+        enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_hardtanh.py b/backends/samsung/builders/op_hardtanh.py
index 4f667bf5299..7d65e97a566 100644
--- a/backends/samsung/builders/op_hardtanh.py
+++ b/backends/samsung/builders/op_hardtanh.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict
 
 import torch
@@ -29,9 +30,12 @@ def define_node(
         input = node.args[0]
         input_id = self.define_tensor(input, enn_graph, vals_to_ids)
 
+        # default value of output_min and output_max
         output_min = cast(float, node.args[1]) if len(node.args) > 1 else -1
         output_max = cast(float, node.args[2]) if len(node.args) > 2 else 1
+
         params = {"minimum": output_min, "maximum": output_max}
+        self._update_params_qdtype(node, params)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
diff --git a/backends/samsung/builders/op_layer_norm.py b/backends/samsung/builders/op_layer_norm.py
index e6f853178d8..098bc92dc84 100644
--- a/backends/samsung/builders/op_layer_norm.py
+++ b/backends/samsung/builders/op_layer_norm.py
@@ -46,9 +46,8 @@ def define_node(
 
         epsilon = node.args[4] if len(node.args) > 4 else 1e-5
         params = {"epsilon": epsilon}
-
+        self._update_params_qdtype(node, params)
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
-
         enn_graph.define_op(
             node.name, "LAYERNORM", all_input_tensors, [output_id], params
         )
diff --git a/backends/samsung/builders/op_linear.py b/backends/samsung/builders/op_linear.py
index 2f7aa1e6415..720439de976 100644
--- a/backends/samsung/builders/op_linear.py
+++ b/backends/samsung/builders/op_linear.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import Dict
 
 import torch
@@ -43,6 +44,7 @@ def define_node(
 
         weight_shape = get_shape(weight_node)
         params = {"in_channels": weight_shape[1], "out_channels": weight_shape[0]}
+        self._update_params_qdtype(node, params)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
diff --git a/backends/samsung/builders/op_max_pool2d.py b/backends/samsung/builders/op_max_pool2d.py
index d386dd30b1a..57b716fcb34 100644
--- a/backends/samsung/builders/op_max_pool2d.py
+++ b/backends/samsung/builders/op_max_pool2d.py
@@ -73,6 +73,7 @@ def define_node(
         params["explicit_padding"] = explicit_padding
         params["dilation_h"] = dilation[0]
         params["dilation_w"] = dilation[1]
+        self._update_params_qdtype(node, params)
 
         if len(node.args) > 5:
             ceil_mode = cast(bool, node.args[5])
diff --git a/backends/samsung/builders/op_mean_dim.py b/backends/samsung/builders/op_mean_dim.py
index 2f07f870ec4..3d0377703a7 100644
--- a/backends/samsung/builders/op_mean_dim.py
+++ b/backends/samsung/builders/op_mean_dim.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import cast, Dict, List
 
 import torch
@@ -27,6 +28,7 @@ def define_node(
         enn_graph: EnnGraph,
         vals_to_ids: Dict[torch.Tensor, int],
     ) -> None:
+        # input
         input = node.args[0]
         input_id = self.define_tensor(input, enn_graph, vals_to_ids)
 
@@ -37,8 +39,11 @@ def define_node(
         in_shape = get_shape(input)
         for dim in dims:
             reduce_axes.append(dim % len(in_shape))
-        reduce_axes.sort()
+
+        if len(node.args[1]) > 1:
+            reduce_axes.sort()
 
         keep_dim = node.args[2] if len(node.args) >= 3 else False
         params = {"keep_dims": keep_dim, "axis": reduce_axes}
+        self._update_params_qdtype(node, params)
         enn_graph.define_op(node.name, "REDUCEMEAN", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_mul.py b/backends/samsung/builders/op_mul.py
index dce531ff0b0..6dd7c0dd9f0 100644
--- a/backends/samsung/builders/op_mul.py
+++ b/backends/samsung/builders/op_mul.py
@@ -1,5 +1,9 @@
-# Copyright (c) 2024 Samsung Electronics Co. LTD
+# Copyright (c) 2025 Samsung Electronics Co. LTD
 # All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 from typing import Dict
 
 import torch
@@ -23,11 +27,17 @@ def define_node(
         enn_graph: EnnGraph,
         vals_to_ids: Dict[torch.Tensor, int],
     ) -> None:
+
         input1 = node.args[0]
         input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids)
+
         input2 = node.args[1]
         input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids)
+        params = {}
+        self._update_params_qdtype(node, params)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "ELTMUL", [input_id_1, input_id_2], [output_id])
+        enn_graph.define_op(
+            node.name, "ELTMUL", [input_id_1, input_id_2], [output_id], params
+        )
diff --git a/backends/samsung/builders/op_quantize.py b/backends/samsung/builders/op_quantize.py
new file mode 100644
index 00000000000..dcf30e291f9
--- /dev/null
+++ b/backends/samsung/builders/op_quantize.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.samsung.builders.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph
+from executorch.backends.samsung.utils.constants import QuantConstants
+
+
+class _QuantOpVistorBase(NodeVisitor):
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        enn_graph: EnnGraph,
+        vals_to_ids: Dict[torch.Tensor, int],
+    ) -> None:
+        # input
+        input = node.args[0]
+        input_id = self.define_tensor(input, enn_graph, vals_to_ids)
+
+        scales = node.args[1]
+        if isinstance(scales, torch.Tensor):
+            scales = scales.tolist()
+        elif not isinstance(scales, list):
+            scales = torch.tensor(scales).reshape([1]).tolist()
+        zero_points = node.args[2]
+        if isinstance(zero_points, torch.Tensor):
+            zero_points = zero_points.tolist()
+        elif not isinstance(zero_points, list):
+            zero_points = torch.tensor(zero_points).reshape([1]).tolist()
+
+        output_id = self.define_tensor(node, enn_graph, vals_to_ids)
+
+        params = {"scales": scales, "zero_points": zero_points}
+
+        if node.target in QuantConstants.QUANT_OPS_KEY_MAP:
+            enn_graph.define_op(node.name, "QUANTIZE", [input_id], [output_id], params)
+        else:
+            enn_graph.define_op(
+                node.name, "DEQUANTIZE", [input_id], [output_id], params
+            )
+
+
+@register_node_visitor
+class QuantizeVistor(_QuantOpVistorBase):
+    target = [
+        "quantized_decomposed.quantize_per_tensor.default",
+        "quantized_decomposed.quantize_per_channel.default",
+    ]
diff --git a/backends/samsung/builders/op_relu.py b/backends/samsung/builders/op_relu.py
index ba90116be1d..a4a2b6bc4f0 100644
--- a/backends/samsung/builders/op_relu.py
+++ b/backends/samsung/builders/op_relu.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+
 from typing import Dict
 
 import torch
@@ -30,5 +31,7 @@ def define_node(
         input_id = self.define_tensor(input, enn_graph, vals_to_ids)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
+        params = {}
+        self._update_params_qdtype(node, params)
 
-        enn_graph.define_op(node.name, "RELU", [input_id], [output_id])
+        enn_graph.define_op(node.name, "RELU", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_softmax.py b/backends/samsung/builders/op_softmax.py
index 1e2e4a378dc..7f569cea6fc 100644
--- a/backends/samsung/builders/op_softmax.py
+++ b/backends/samsung/builders/op_softmax.py
@@ -35,5 +35,5 @@ def define_node(
 
         axis = cast(int, node.args[1])
         params = {"axis": axis}
-
+        self._update_params_qdtype(node, params)
         enn_graph.define_op(node.name, "SOFTMAX", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_squeeze.py b/backends/samsung/builders/op_squeeze.py
index d165a22fcb3..82fa17fbc95 100644
--- a/backends/samsung/builders/op_squeeze.py
+++ b/backends/samsung/builders/op_squeeze.py
@@ -33,4 +33,5 @@ def define_node(
         # output
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id])
+        params = {"new_shape": [*node.meta["val"].shape]}
+        enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_to_copy.py b/backends/samsung/builders/op_to_copy.py
index 545672ef6a3..c770602bb5f 100644
--- a/backends/samsung/builders/op_to_copy.py
+++ b/backends/samsung/builders/op_to_copy.py
@@ -11,6 +11,8 @@
     NodeVisitor,
     register_node_visitor,
 )
+
+from executorch.backends.samsung.builders.utils import get_map_dtype, get_tensor
 from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph
 
 
@@ -35,5 +37,8 @@ def define_node(
         input_id = self.define_tensor(input, enn_graph, vals_to_ids)
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
+        params = {}
+        out_tensor = get_tensor(self.exported_program, node)
+        params["out_dtype"] = get_map_dtype(out_tensor.dtype)
 
-        enn_graph.define_op(node.name, "CAST", [input_id], [output_id])
+        enn_graph.define_op(node.name, "CAST", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_unsqueeze.py b/backends/samsung/builders/op_unsqueeze.py
index 942c3307de7..61fa06e6310 100644
--- a/backends/samsung/builders/op_unsqueeze.py
+++ b/backends/samsung/builders/op_unsqueeze.py
@@ -31,4 +31,5 @@ def define_node(
 
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
 
-        enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id])
+        params = {"new_shape": [*node.meta["val"].shape]}
+        enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params)
diff --git a/backends/samsung/builders/op_upsample_bilinear2d.py b/backends/samsung/builders/op_upsample_bilinear2d.py
index a934b2789ba..d4b040460e3 100644
--- a/backends/samsung/builders/op_upsample_bilinear2d.py
+++ b/backends/samsung/builders/op_upsample_bilinear2d.py
@@ -46,6 +46,7 @@ def define_node(
             "upsampling_factor": scale_factor,
             "half_pixel_centers": True,
         }
+        self._update_params_qdtype(node, params)
         output_id = self.define_tensor(node, enn_graph, vals_to_ids)
         enn_graph.define_op(
             node.name, "RESIZE_BILINEAR", [input_id], [output_id], params
diff --git a/backends/samsung/builders/utils.py b/backends/samsung/builders/utils.py
index 58c84ff6d31..a640071c798 100644
--- a/backends/samsung/builders/utils.py
+++ b/backends/samsung/builders/utils.py
@@ -9,7 +9,6 @@
 import torch
 from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output
 from executorch.backends.transforms.utils import get_param_tensor, is_param_node
-
 from torch.export import ExportedProgram
 
 DATA_TYPE_STR_MAPPING = {
diff --git a/backends/samsung/enn_preprocess.py b/backends/samsung/enn_preprocess.py
index dde01bc09c7..0847ec0adeb 100644
--- a/backends/samsung/enn_preprocess.py
+++ b/backends/samsung/enn_preprocess.py
@@ -9,10 +9,16 @@
 
 import executorch.backends.samsung.python.PyEnnWrapperAdaptor as PyEnnWrapper
 import torch
+from executorch.backends.samsung._passes.annotate_qparams import AnnotateQparamsPass
+from executorch.backends.samsung._passes.annotate_scalar_parameters import (
+    AnnotateScalarParametersPass,
+)
 from executorch.backends.samsung._passes.conv1d_to_conv2d import Conv1dToConv2d
 from executorch.backends.samsung._passes.customized_constant_prop import (
     ConstantPropPass,
 )
+from executorch.backends.samsung._passes.fold_qdq import FoldQDQPass
+from executorch.backends.samsung._passes.insert_qdq import InsertQDQPass
 from executorch.backends.samsung._passes.replace_scalar_ops import ReplaceOpsWithScalar
 from executorch.backends.samsung.builders.node_visitor import get_node_visitors
 from executorch.backends.samsung.serialization.compile_options import (
@@ -53,12 +59,16 @@ def preprocess(
 
         enn_preprocess_passes = PassManager(
             passes=[
+                AnnotateQparamsPass(edge_program),
+                FoldQDQPass(),
                 ConstantPropPass(edge_program),
                 Conv1dToConv2d(edge_program),
                 FuseBatchNormWithConvPass(edge_program),
                 AddmmToLinearTransform(),
                 ReplaceOpsWithScalar(),
                 RemoveGetItemPass(),
+                InsertQDQPass(edge_program),
+                AnnotateScalarParametersPass(edge_program),
             ]
         )
         pass_result = enn_preprocess_passes(edge_program.graph_module)
diff --git a/backends/samsung/partition/enn_partitioner.py b/backends/samsung/partition/enn_partitioner.py
index 952cb000429..368d069c380 100644
--- a/backends/samsung/partition/enn_partitioner.py
+++ b/backends/samsung/partition/enn_partitioner.py
@@ -129,5 +129,6 @@ def ops_to_not_decompose(
             torch.ops.aten.prelu.default,
             torch.ops.aten.layer_norm.default,
             torch.ops.aten.pixel_shuffle.default,
+            torch.ops.aten.hardsigmoid.default,
         ]
         return (ops_not_to_decompose, None)
diff --git a/backends/samsung/quantizer/__init__.py b/backends/samsung/quantizer/__init__.py
new file mode 100644
index 00000000000..621eec69240
--- /dev/null
+++ b/backends/samsung/quantizer/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .qconfig import Precision
+from .quantizer import EnnQuantizer
+
+__all__ = [EnnQuantizer, Precision]
diff --git a/backends/samsung/quantizer/annotator.py b/backends/samsung/quantizer/annotator.py
new file mode 100644
index 00000000000..31015698006
--- /dev/null
+++ b/backends/samsung/quantizer/annotator.py
@@ -0,0 +1,871 @@
+# Copyright (c) Qualcomm Innovation Center, Inc
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Dict, List
+
+import torch
+from torch._ops import OpOverload
+from torch._subclasses import FakeTensor
+
+from torch.fx import Graph, Node
+
+from torchao.quantization.pt2e import FixedQParamsObserver
+from torchao.quantization.pt2e.quantizer import (
+    annotate_output_qspec,
+    QuantizationAnnotation,
+    QuantizationSpec,
+    SharedQuantizationSpec,
+)
+
+from .qconfig import QuantizationConfig
+
+OP_ANNOTATOR: Dict[OpOverload, Callable] = {}
+
+ADD_OPS = [
+    torch.ops.aten.add,
+    torch.ops.aten.add.Tensor,
+    torch.ops.aten.add_.Tensor,
+]
+
+
+def register_annotator(ops: List[OpOverload]):
+    def decorator(annotator: Callable):
+        for op in ops:
+            OP_ANNOTATOR[op] = annotator
+
+    return decorator
+
+
+def annotate(graph: Graph, quant_config: QuantizationConfig) -> None:
+    # Pattern annotation
+    _annotate_fused_activation_pattern(graph, quant_config)
+
+    # Per-op annotation
+    for node in graph.nodes:
+        if node.op == "placeholder":
+            annotate_placeholder(node, quant_config)
+        elif node.op == "call_function":
+            annotate_func = OP_ANNOTATOR.get(node.target, None)
+            if annotate_func is not None:
+                annotate_func(node, quant_config)
+
+
+def _is_annotated(nodes: List[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    return True if any of the node
+    is annotated, otherwise return False
+    """
+    annotated = False
+    for node in nodes:
+        annotated = annotated or (
+            "quantization_annotation" in node.meta
+            and node.meta["quantization_annotation"]._annotated
+        )
+    return annotated
+
+
+def _is_fake_tensor(node: Node):
+    if (
+        isinstance(node, Node)
+        and "val" in node.meta
+        and isinstance(node.meta["val"], FakeTensor)
+    ):
+        return True
+    return False
+
+
+def _is_float_tensor(node: Node):
+    """Check if the node's tensor is a float tensor,
+    so that we can skip quantization for the node
+    since observers only works with float Tensors
+    """
+    if not _is_fake_tensor(node):
+        return False
+    return node.meta["val"].dtype in [torch.float32, torch.float16]
+
+
+def _mark_nodes_as_annotated(nodes: List[Node]):
+    for node in nodes:
+        if "quantization_annotation" not in node.meta:
+            node.meta["quantization_annotation"] = QuantizationAnnotation()
+        node.meta["quantization_annotation"]._annotated = True
+
+
+# for nodes whose targets ars placehold (not call_function)
+def annotate_placeholder(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    if _is_float_tensor(node):
+        annotate_output_qspec(node, quant_config.output_activation)
+
+    _mark_nodes_as_annotated([node])
+
+
+# CASE 1: fused_activation case (ex. Conv2D + ReLU)
+def _is_hardtanh_for_relux(relu_node: torch.fx.node.Node):
+    if relu_node.target in [
+        torch.ops.aten.hardtanh.default,
+        torch.ops.aten.hardtanh_.default,
+    ]:
+        # checking if hardtanh is convertable to ReLU6
+        # ReLU1 is not supported now
+        if not relu_node.args[1] == 0.0:
+            return False
+        if relu_node.args[2] == 6.0:  # for ReLU6
+            return True
+    return True
+
+
+def _annotate_fused_activation_pattern(
+    graph: Graph, quant_config: QuantizationConfig
+) -> None:
+    for relu_node in graph.nodes:
+        # Check relu/relu6 node
+        if relu_node.op != "call_function":
+            continue
+        if relu_node.target not in [
+            # The strategy of ReLU and ReLU6 is fold_activation in ENNQuant
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+            torch.ops.aten.relu6.default,
+            torch.ops.aten.relu6_.default,
+            torch.ops.aten.hardtanh.default,
+            torch.ops.aten.hardtanh_.default,
+        ]:
+            continue
+
+        if not _is_hardtanh_for_relux(relu_node):
+            continue
+
+        producer_node = relu_node.args[0]
+        if not isinstance(producer_node, Node):
+            continue
+        if producer_node.op != "call_function":
+            continue
+        if len(producer_node.users) != 1:
+            continue
+
+        # Handle affine + relu fusion
+        if producer_node.target in [
+            torch.ops.aten.conv1d.default,
+            torch.ops.aten.conv2d.default,
+            torch.ops.aten.linear.default,
+        ]:
+            # input & weight (or bias) setting for Conv node(producer_node)
+            quantization_annotation = producer_node.meta.get(
+                "quantization_annotation", QuantizationAnnotation()
+            )
+            if quantization_annotation.input_qspec_map is None:
+                quantization_annotation.input_qspec_map = {}
+
+            input = producer_node.args[0]
+            quantization_annotation.input_qspec_map[input] = (
+                quant_config.input_activation
+            )
+
+            quantization_annotation.input_qspec_map[producer_node.args[1]] = (
+                quant_config.weight
+            )
+            if len(producer_node.args) > 2 and quant_config.bias is not None:
+                quantization_annotation.input_qspec_map[producer_node.args[2]] = (
+                    quant_config.bias
+                )
+
+            producer_node.meta["quantization_annotation"] = quantization_annotation
+            producer_node.meta["quantization_annotation"]._annotated = True
+            # out setting for activation node (relu_node)
+            quantization_annotation = relu_node.meta.get(
+                "quantization_annotation", QuantizationAnnotation()
+            )
+            quantization_annotation.output_qspec = quant_config.output_activation
+
+            relu_node.meta["quantization_annotation"] = quantization_annotation
+            relu_node.meta["quantization_annotation"]._annotated = True
+            continue
+
+
+# CASE 2-1: two input case without Shared Quant
+@register_annotator(
+    [
+        torch.ops.aten.div,
+        torch.ops.aten.div.Tensor,
+        torch.ops.aten.divide.Tensor,
+        torch.ops.aten.matmul.default,
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.sum.dim_IntList,
+    ]
+)
+def annotate_2in1out(node: Node, quant_config: QuantizationConfig) -> None:
+    input_act0 = node.args[0]
+    input_act1 = node.args[1]
+    # skipping quantization if 1st input is not float.
+    if _is_annotated([node]) or not _is_float_tensor(input_act0):
+        return
+
+    input_act_qspec = quant_config.input_activation
+    output_act_qspec = (
+        quant_config.output_activation if _is_float_tensor(node) else None
+    )
+
+    input_qspec_map = {}
+    if _is_float_tensor(input_act0):
+        input_qspec_map[input_act0] = input_act_qspec
+
+    if _is_float_tensor(input_act1):
+        input_qspec_map[input_act1] = input_act_qspec
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=output_act_qspec,
+        _annotated=True,
+    )
+
+
+# getting QuantAnnot though the first input
+def _get_quantization_annotation(node: Node):
+    if node.op == "placeholder":
+        return False
+    elif "quantization_annotation" in node.meta:
+        return node
+    elif node.args == ():
+        return False
+    elif isinstance(node.args[0], Node):
+        return _get_quantization_annotation(node.args[0])
+    elif isinstance(node.args[0], list):
+        # for cat, concatenate and stack
+        if isinstance(node.args[0][0], Node):
+            return _get_quantization_annotation(node.args[0][0])
+        else:
+            return False
+    else:
+        return False
+
+
+# CASE 2-2: two input case with Shared Quant
+# ops.add / ops.add_ are processed by another annotator
+@register_annotator(
+    [
+        torch.ops.aten.sub,
+        torch.ops.aten.mul,
+        torch.ops.aten.sub.Tensor,
+        torch.ops.aten.mul.Tensor,
+        torch.ops.aten.sub_.Tensor,
+        torch.ops.aten.mul_.Tensor,
+        torch.ops.aten.rsub.Scalar,
+        torch.ops.aten.mul.Scalar,
+    ]
+)
+def annotate_2in1out_with_SharedQuant(
+    node: Node, quant_config: QuantizationConfig
+) -> None:
+
+    input_qspec_map = {}
+    input0 = node.args[0]
+    input1 = node.args[1]
+
+    # skipping quantization if 1st input is not float.
+    if _is_annotated([node]) or not _is_float_tensor(input0):
+        return
+    if (
+        isinstance(input0, Node)
+        and isinstance(input1, float)
+        and not _get_quantization_annotation(input0)
+    ):
+        return
+    if (
+        isinstance(input0, float)
+        and isinstance(input1, Node)
+        and not _get_quantization_annotation(input1)
+    ):
+        return
+    if isinstance(input0, Node) and isinstance(input1, Node):
+        shared_qspec = SharedQuantizationSpec((input0, node))
+        input_qspec_map[input0] = quant_config.input_activation
+        input_qspec_map[input1] = shared_qspec
+
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=shared_qspec,
+            _annotated=True,
+        )
+
+    else:
+        input_act_qspec = quant_config.input_activation
+        output_act_qspec = (
+            quant_config.output_activation if _is_float_tensor(node) else None
+        )
+
+        input_qspec_map = {}
+        input_act0 = node.args[0]
+        if _is_float_tensor(input_act0):
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = node.args[1]
+        if _is_float_tensor(input_act1):
+            input_qspec_map[input_act1] = input_act_qspec
+
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+
+
+# CASE 2-3: only for add ops
+@register_annotator(ADD_OPS)
+def annotate_add_ops_with_SharedQuant(
+    node: Node, quant_config: QuantizationConfig
+) -> None:
+
+    input_qspec_map = {}
+    input0 = node.args[0]
+    input1 = node.args[1]
+
+    # skipping quantization if 1st input is not float.
+    if _is_annotated([node]) or not _is_float_tensor(input0):
+        return
+
+    if isinstance(input0, Node) and isinstance(input1, Node):
+        NonQuantShare_ops_for_add = [torch.ops.aten.dropout.default] + ADD_OPS
+        if (
+            input0.op == "call_function" and input0.target in NonQuantShare_ops_for_add
+        ) or (
+            input1.op == "call_function" and input1.target in NonQuantShare_ops_for_add
+        ):
+            input_act_qspec = quant_config.input_activation
+            output_act_qspec = (
+                quant_config.output_activation if _is_float_tensor(node) else None
+            )
+
+            input_qspec_map = {}
+            input_act0 = node.args[0]
+            if _is_float_tensor(input_act0):
+                input_qspec_map[input_act0] = input_act_qspec
+
+            input_act1 = node.args[1]
+            if _is_float_tensor(input_act1):
+                input_qspec_map[input_act1] = input_act_qspec
+
+            node.meta["quantization_annotation"] = QuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                output_qspec=output_act_qspec,
+                _annotated=True,
+            )
+        else:
+            shared_qspec = SharedQuantizationSpec((input0, node))
+            input_qspec_map[input0] = quant_config.input_activation
+            input_qspec_map[input1] = shared_qspec
+
+            node.meta["quantization_annotation"] = QuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                output_qspec=shared_qspec,
+                _annotated=True,
+            )
+    elif (
+        isinstance(input0, Node)
+        and isinstance(input1, float)
+        and not _get_quantization_annotation(input0)
+    ):
+        pass
+    elif (
+        isinstance(input0, float)
+        and isinstance(input1, Node)
+        and not _get_quantization_annotation(input1)
+    ):
+        pass
+    else:
+        input_act_qspec = quant_config.input_activation
+        output_act_qspec = (
+            quant_config.output_activation if _is_float_tensor(node) else None
+        )
+
+        input_qspec_map = {}
+        input_act0 = node.args[0]
+        if _is_float_tensor(input_act0):
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = node.args[1]
+        if _is_float_tensor(input_act1):
+            input_qspec_map[input_act1] = input_act_qspec
+
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+
+
+# CASE 3-1: Single input + Single Out case without Shared Quant
+@register_annotator(
+    [
+        torch.ops.aten.ceil.default,
+        torch.ops.aten.clamp.default,
+        torch.ops.aten.relu.default,
+        torch.ops.aten.relu_.default,
+        torch.ops.aten.relu6.default,
+        torch.ops.aten.relu6_.default,
+        torch.ops.aten.cos.default,
+        torch.ops.aten.sin.default,
+        torch.ops.aten.tanh.default,
+        torch.ops.aten.hardswish.default,
+        torch.ops.aten.hardswish_.default,
+        torch.ops.aten.hardsigmoid.default,
+        torch.ops.aten.hardsigmoid_.default,
+        torch.ops.aten.hardtanh.default,
+        torch.ops.aten.hardtanh_.default,
+        torch.ops.aten.mean.default,
+        torch.ops.aten.adaptive_avg_pool2d.default,
+        torch.ops.aten.avg_pool2d.default,
+        torch.ops.aten.leaky_relu.default,
+        torch.ops.aten.leaky_relu_.default,
+        torch.ops.aten.prelu.default,
+        torch.ops.aten.upsample_bilinear2d.vec,
+        torch.ops.aten.upsample_nearest2d.vec,
+        torch.ops.aten.mean.dim,
+        torch.ops.aten.sqrt.default,
+        torch.ops.aten.gelu.default,
+        torch.ops.aten.scaled_dot_product_attention.default,
+        torch.ops.aten.rsqrt.default,
+        torch.ops.aten.pow.Tensor_Scalar,
+        torch.ops.aten.topk.default,
+    ]
+)
+def annotate_1in1out(node: Node, quant_config: QuantizationConfig) -> None:
+    # skipping quantization if input is not float.
+    if _is_annotated([node]) or not _is_float_tensor(node.args[0]):
+        return
+
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    # one inputs + one output case.
+    input_act_qspec = quant_config.input_activation
+    quantization_annotation.input_qspec_map[node.args[0]] = input_act_qspec
+    quantization_annotation.output_qspec = quant_config.output_activation
+
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+# CASE 3-2: Single input + Single Out case with Shared Quant
+@register_annotator(
+    [
+        torch.ops.aten.permute.default,
+        torch.ops.aten.view.default,
+        torch.ops.aten._unsafe_view.default,
+        torch.ops.aten.squeeze.default,
+        torch.ops.aten.squeeze.dim,
+        torch.ops.aten.squeeze_copy.dims,
+        torch.ops.aten.unsqueeze.default,
+        torch.ops.aten.unsqueeze_copy.default,
+        torch.ops.aten.transpose.int,
+        torch.ops.aten.expand.default,
+        torch.ops.aten.max_pool2d.default,
+        torch.ops.aten.max_pool2d_with_indices.default,
+        torch.ops.aten.reshape.default,
+        torch.ops.aten.select.int,
+        torch.ops.aten.flatten.using_ints,
+        torch.ops.aten.pad.default,
+        torch.ops.aten.slice.Tensor,
+        torch.ops.aten.to.dtype,
+    ]
+)
+def annotate_1in1out_with_SharedQuant(
+    node: Node, quant_config: QuantizationConfig
+) -> None:
+    input_qspec_map = {}
+    input = node.args[0]
+    assert isinstance(input, Node)
+    if _is_annotated([node]) or not _is_float_tensor(input):
+        return
+
+    shared_qspec = SharedQuantizationSpec((input, node))
+
+    # get QuantAnnot from the input path
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node)
+        shared_qspec = SharedQuantizationSpec((shared_quant_node, node))
+    else:
+        # if no QuantAnnot in the input path
+        input_qspec_map[input] = quant_config.input_activation
+        shared_qspec = SharedQuantizationSpec((input, node))
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=shared_qspec,
+        _annotated=True,
+    )
+
+
+# CASE 3-3: Single input + Single Out case with FP
+@register_annotator(
+    [
+        torch.ops.aten.softmax.int,
+        torch.ops.aten._softmax.default,
+        torch.ops.aten._safe_softmax.default,
+        torch.ops.aten.log_softmax.int,
+    ]
+)
+def annotate_1in1out_with_SharedQuant_for_FP(
+    node: Node, quant_config: QuantizationConfig
+) -> None:
+    input_qspec_map = {}
+    input = node.args[0]
+    assert isinstance(input, Node)
+
+    if _is_annotated([node]) or not _is_float_tensor(input):
+        return
+
+    if input.target in ADD_OPS and _is_annotated([input]):
+        del input.meta["quantization_annotation"]
+
+    # get QuantAnnot from the input path
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        # if QuantAnnot in the input path, input_qspec is shared, but output_qspec is not.
+        input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node)
+
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quant_config.output_activation,
+            _annotated=True,
+        )
+    else:
+        # if no QuantAnnot in the input path
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=quant_config.output_activation,
+            _annotated=True,
+        )
+
+
+# CASE 4: One value input + one index input with Shared Quant
+@register_annotator([torch.ops.aten.index.Tensor])
+def annotate_index(node: Node, quant_config: QuantizationConfig) -> None:
+    input_qspec_map = {}
+    input = node.args[0]
+    assert isinstance(input, Node)
+
+    if _is_annotated([node]) or not _is_float_tensor(input):
+        return
+
+    # get QuantAnnt from the input path
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        shared_qspec = SharedQuantizationSpec((shared_quant_node, node))
+        input_qspec_map[input] = quant_config.input_activation
+
+        # sharing QuantAnnot with the parent
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=shared_qspec,
+            _annotated=True,
+        )
+
+
+# CASE 5 input + index + value & output with Shared Quant
+@register_annotator(
+    [torch.ops.aten.index_put.default, torch.ops.aten.index_put_.default]
+)
+def annotate_index_put(node: Node, quant_config: QuantizationConfig) -> None:
+    input_qspec_map = {}
+    input = node.args[0]  # from KVCache in LLAMA
+    value = node.args[2]  # from linear projection layer
+    assert isinstance(input, Node)
+    assert isinstance(value, Node)
+
+    if _is_annotated([node]) or not _is_float_tensor(input):
+        return
+
+    # get QuantAnnot from input path
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        shared_qspec = SharedQuantizationSpec((shared_quant_node, node))
+        input_qspec_map[input] = shared_qspec
+        input_qspec_map[value] = shared_qspec
+        output_qspec = shared_qspec
+    else:
+        # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config.
+        input_qspec_map[input] = quant_config.input_activation
+        input_qspec_map[value] = SharedQuantizationSpec((input, node))
+        output_qspec = SharedQuantizationSpec((input, node))
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=output_qspec,
+        _annotated=True,
+    )
+
+
+# CASE 6 unbind + getitem case
+# (inputQuant--unbinde--no Qunat) --> (no Qunat--getitem--outputQuant)
+@register_annotator([torch.ops.aten.unbind.int])
+def annotate_unbind(node: Node, quant_config: QuantizationConfig) -> None:
+    input_qspec_map = {}
+    input = node.args[0]
+    assert isinstance(input, Node)
+
+    if _is_annotated([node]) or not _is_float_tensor(input):
+        return
+
+    # get QuantAnnot from input path
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        input_qspec_map[input] = quant_config.input_activation
+        shared_qspec = SharedQuantizationSpec((shared_quant_node, node))
+    else:
+        # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config.
+        input_qspec_map[input] = quant_config.input_activation
+        shared_qspec = SharedQuantizationSpec((input, node))
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=shared_qspec,
+        _annotated=True,
+    )
+
+    for users_node in node.users:
+        users_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=shared_qspec,
+            _annotated=True,
+        )
+
+
+# CASE 7: stand-alone Conv2d and Conv1d
+@register_annotator(
+    [
+        torch.ops.aten.conv2d.default,
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.linear.default,
+    ]
+)
+def annotate_conv2d(node: Node, quant_config: QuantizationConfig) -> None:
+    # skipping quantization if weights are not float
+    if _is_annotated([node]) or not _is_float_tensor(node.args[1]):
+        return
+
+    input = node.args[0]
+    # input & weight (or bias) setting for Conv node(producer_node)
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    shared_quant_node = _get_quantization_annotation(input)
+    if shared_quant_node:
+        quantization_annotation.input_qspec_map[input] = SharedQuantizationSpec(
+            shared_quant_node
+        )
+    else:
+        quantization_annotation.input_qspec_map[input] = quant_config.input_activation
+    quantization_annotation.input_qspec_map[node.args[1]] = quant_config.weight
+    if len(node.args) > 2 and quant_config.bias is not None:
+        quantization_annotation.input_qspec_map[node.args[2]] = quant_config.bias
+    quantization_annotation.output_qspec = quant_config.output_activation
+
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+# CASE 8: embedding
+@register_annotator([torch.ops.aten.embedding.default])
+def annotate_embedding(node: Node, quant_config: QuantizationConfig) -> None:
+    input_qspec_map = {}
+    weight = node.args[0]
+    if _is_annotated([node]) or not _is_float_tensor(weight):
+        return
+
+    input_qspec_map[weight] = quant_config.input_activation
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=quant_config.output_activation,
+        _annotated=True,
+    )
+
+
+# CASE 9: Concat & Stack
+@register_annotator(
+    [
+        torch.ops.aten.cat.default,
+        torch.ops.aten.concat.default,
+        torch.ops.aten.stack.default,
+    ]
+)
+def annotate_cat(node: Node, quant_config: QuantizationConfig) -> None:
+    inputs = node.args[0]
+    first_input = inputs[0]
+    assert isinstance(inputs, list)
+    assert isinstance(first_input, Node)
+
+    if _is_annotated([node]) or not _is_float_tensor(first_input):
+        return
+
+    input_qspec_map = {}
+    shared_qspec = SharedQuantizationSpec((first_input, node))
+    for input in inputs:
+        if input == first_input:
+            input_qspec_map[input] = quant_config.input_activation
+        else:
+            input_qspec_map[input] = shared_qspec
+
+    node.meta["quantization_annotation"] = QuantizationAnnotation(
+        input_qspec_map=input_qspec_map,
+        output_qspec=shared_qspec,
+        _annotated=True,
+    )
+
+
+# CASE 10: various normalizations
+@register_annotator([torch.ops.aten.rms_norm.default])
+def annotate_rms_norm(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    quantization_annotation.input_qspec_map[node.args[0]] = (
+        quant_config.input_activation
+    )  # active
+    quantization_annotation.input_qspec_map[node.args[2]] = (
+        quant_config.input_activation
+    )  # weight
+    quantization_annotation.output_qspec = quant_config.output_activation
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+@register_annotator([torch.ops.aten.group_norm.default])
+def annotate_group_norm(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    quantization_annotation.input_qspec_map[node.args[0]] = (
+        quant_config.input_activation
+    )  # active
+    quantization_annotation.input_qspec_map[node.args[2]] = (
+        quant_config.weight
+    )  # weight
+    quantization_annotation.output_qspec = quant_config.output_activation
+
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+@register_annotator([torch.ops.aten.layer_norm.default])
+def annotate_layer_norm(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    quantization_annotation.input_qspec_map[node.args[0]] = (
+        quant_config.input_activation
+    )  # active
+    quantization_annotation.input_qspec_map[node.args[2]] = (
+        quant_config.input_activation
+    )  # weight
+    quantization_annotation.output_qspec = quant_config.output_activation
+
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default])
+def annotate_batch_norm(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+
+    quantization_annotation.input_qspec_map[node.args[0]] = (
+        quant_config.input_activation
+    )  # active
+
+    quantization_annotation.input_qspec_map[node.args[1]] = (
+        quant_config.input_activation
+    )  # weight
+    quantization_annotation.output_qspec = quant_config.output_activation
+
+    node.meta["quantization_annotation"] = quantization_annotation
+    node.meta["quantization_annotation"]._annotated = True
+
+
+# CASE 11: Sigmoid
+@register_annotator([torch.ops.aten.sigmoid, torch.ops.aten.sigmoid.default])
+def annotate_sigmoid(node: Node, quant_config: QuantizationConfig) -> None:
+    if _is_annotated([node]):
+        return
+
+    input_qspec_map = {}
+    input_act = node.args[0]
+    input_qspec_map[input_act] = quant_config.input_activation
+
+    assert isinstance(input_act, Node)
+    out_qconf = quant_config.output_activation
+
+    q_max = (
+        torch.iinfo(out_qconf.dtype).max
+        if out_qconf.quant_max is None
+        else out_qconf.quant_max
+    )
+    q_min = (
+        torch.iinfo(out_qconf.dtype).min
+        if out_qconf.quant_min is None
+        else out_qconf.quant_min
+    )
+
+    scale = 1 / (q_max - q_min + 1)
+
+    bias_obs_ctr = FixedQParamsObserver.with_args(
+        scale=scale,
+        zero_point=0,
+        dtype=quant_config.output_activation.dtype,
+        qscheme=torch.torch.per_tensor_affine,
+        quant_max=q_max,
+        quant_min=q_min,
+    )
+
+    # make sigmoid map to the range between 0~1
+    out_act_quantization_spec = QuantizationSpec(
+        dtype=quant_config.output_activation.dtype,
+        quant_max=q_max,
+        quant_min=q_min,
+        observer_or_fake_quant_ctr=bias_obs_ctr,
+        qscheme=torch.torch.per_tensor_affine,
+    )
+
+    if _is_float_tensor(node):
+        node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=out_act_quantization_spec,
+            _annotated=True,
+        )
diff --git a/backends/samsung/quantizer/qconfig.py b/backends/samsung/quantizer/qconfig.py
new file mode 100644
index 00000000000..f32c8d39796
--- /dev/null
+++ b/backends/samsung/quantizer/qconfig.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from enum import IntEnum, unique
+from typing import Callable, Optional
+
+import torch
+from torchao.quantization.pt2e import (
+    FakeQuantize,
+    MinMaxObserver,
+    PerChannelMinMaxObserver,
+)
+from torchao.quantization.pt2e.quantizer import QuantizationSpec
+
+
+@unique
+class Precision(IntEnum):
+    A8W8 = 3
+
+
+@dataclass(eq=True, frozen=True)
+class QuantizationConfig:
+    input_activation: Optional[QuantizationSpec]
+    output_activation: Optional[QuantizationSpec]
+    weight: Optional[QuantizationSpec]
+    bias: Optional[QuantizationSpec | Callable]
+
+
+def get_quant_config(
+    precision: Precision,
+    is_per_channel: bool = False,
+    is_qat: bool = False,
+) -> QuantizationConfig:
+
+    precision_mappings = {
+        Precision.A8W8: get_a8w8_enn_quant_config,
+    }
+    if precision not in precision_mappings:
+        raise RuntimeError("Unrecognized precision setting.")
+
+    is_weight_symm = is_per_channel
+
+    qconfig_fn = precision_mappings[precision]
+    return qconfig_fn(is_per_channel, is_qat, wei_symmetric=is_weight_symm)
+
+
+def _get_activation_qspec(
+    dtype,
+    is_symmetric,
+    is_qat,
+    observer_cls=MinMaxObserver,
+    quant_min=None,
+    quant_max=None,
+):
+    eps_value = 2**-12
+    if quant_max is None:
+        quant_max = torch.iinfo(dtype).max
+    if quant_min is None:
+        quant_min = torch.iinfo(dtype).min
+
+    qscheme = torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine
+    if is_qat:
+        observer_or_fake_quant = FakeQuantize.with_args(
+            observer=observer_cls, eps=eps_value
+        )
+    else:
+        observer_or_fake_quant = observer_cls.with_args(eps=eps_value)
+
+    return QuantizationSpec(
+        dtype=dtype,
+        quant_min=quant_min,
+        quant_max=quant_max,
+        qscheme=qscheme,
+        observer_or_fake_quant_ctr=observer_or_fake_quant,
+    )
+
+
+def _get_weight_qspec(
+    dtype, is_symmetric, is_per_channel, is_qat, quant_min=None, quant_max=None
+):
+    assert is_symmetric or not is_per_channel, "Not support asymm+perchannel mode"
+
+    eps_value = 2**-12
+
+    if quant_max is None:
+        quant_max = torch.iinfo(dtype).max
+    if quant_min is None:
+        quant_min = torch.iinfo(dtype).min
+
+    if not is_per_channel:
+        qscheme = (
+            torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine
+        )
+        observer_cls = MinMaxObserver
+    else:
+        qscheme = (
+            torch.per_channel_symmetric if is_symmetric else torch.per_channel_affine
+        )
+        observer_cls = PerChannelMinMaxObserver
+
+    if is_qat:
+        observer_or_fake_quant = FakeQuantize.with_args(
+            observer=observer_cls, eps=eps_value
+        )
+    else:
+        observer_or_fake_quant = observer_cls.with_args(eps=eps_value)
+
+    return QuantizationSpec(
+        dtype=dtype,
+        quant_min=quant_min,
+        quant_max=quant_max,
+        qscheme=qscheme,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=observer_or_fake_quant,
+    )
+
+
+def get_a8w8_enn_quant_config(
+    is_per_channel=True, is_qat=False, act_symmetric=False, wei_symmetric=False
+) -> QuantizationConfig:
+    act_quantization_spec = _get_activation_qspec(torch.int8, act_symmetric, is_qat)
+    wgt_quantization_spec = _get_weight_qspec(
+        torch.int8, wei_symmetric, is_per_channel, is_qat
+    )
+    bias_quantization_spec = None
+    quantization_config = QuantizationConfig(
+        input_activation=act_quantization_spec,
+        output_activation=act_quantization_spec,
+        weight=wgt_quantization_spec,
+        bias=bias_quantization_spec,
+    )
+    return quantization_config
+
+
+class QuantInfo:
+    def __init__(self, torch_dtype: torch.dtype, string: str):
+        self._torch_dtype = torch_dtype
+        self._string = string
+
+    @property
+    def torch_dtype(self):
+        return self._torch_dtype
+
+    @property
+    def string(self):
+        return self._string
+
+
+class QuantInfoManager:
+    QUANT_INFO_MAP = {
+        Precision.A8W8: (QuantInfo(torch.int8, "INT8"), QuantInfo(torch.int8, "INT8")),
+    }
+    FP_INFO = (
+        QuantInfo(torch.float32, "FLOAT32"),
+        QuantInfo(torch.float32, "FLOAT32"),
+    )
+
+    def __init__(self):
+        self.precision = None
+
+    def set_precision(self, precision: Precision):
+        self.precision = precision
+
+    @property
+    def weight_precison(self) -> Optional[QuantInfo]:
+        return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[0]
+
+    @property
+    def act_precision(self) -> Optional[QuantInfo]:
+        return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[1]
diff --git a/backends/samsung/quantizer/quantizer.py b/backends/samsung/quantizer/quantizer.py
new file mode 100644
index 00000000000..cf46677d000
--- /dev/null
+++ b/backends/samsung/quantizer/quantizer.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Sequence
+
+import torch
+from torch.fx import GraphModule
+from torchao.quantization.pt2e.quantizer import Quantizer
+
+from .annotator import annotate
+from .qconfig import get_quant_config, Precision, QuantInfoManager
+
+
+global_quant_info = QuantInfoManager()
+
+
+class EnnQuantizer(Quantizer):
+
+    def __init__(self):
+        super().__init__()
+
+        self._precision = Precision.A8W8
+        global_quant_info.set_precision(self._precision)
+        self._is_per_channel = True
+        self._is_qat = False
+        self.custom_quant_annotations: Sequence[Callable] = []
+
+    def setup_precision(self, quant_dtype: Precision) -> None:
+        assert quant_dtype in Precision, f"No support for Precision {quant_dtype}."
+        self._precision = quant_dtype
+        global_quant_info.set_precision(self._precision)
+
+    def setup_quant_params(
+        self, quant_dtype: Precision, is_per_channel=True, is_qat=False
+    ) -> None:
+        assert quant_dtype in Precision, f"No support for Precision {quant_dtype}."
+        self._precision = quant_dtype
+        self._is_per_channel = is_per_channel
+        self._is_qat = is_qat
+
+    def annotate(self, model: GraphModule) -> GraphModule:
+        self._annotate(model)
+        self._annotate_custom_annotation(model)
+        return model
+
+    def _annotate(self, gm: GraphModule) -> None:
+        quant_config = get_quant_config(
+            self._precision, self._is_per_channel, self._is_qat
+        )
+        annotate(gm.graph, quant_config)
+
+    def add_custom_quant_annotations(
+        self, custom_quant_annotations: Sequence[Callable]
+    ) -> None:
+        self.custom_quant_annotations = custom_quant_annotations
+
+    def _annotate_custom_annotation(self, gm: GraphModule) -> None:
+        for annotation_func in self.custom_quant_annotations:
+            annotation_func(gm)
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        return
diff --git a/backends/samsung/serialization/compile_options.py b/backends/samsung/serialization/compile_options.py
index 1ad2350cfeb..a4af40368e9 100644
--- a/backends/samsung/serialization/compile_options.py
+++ b/backends/samsung/serialization/compile_options.py
@@ -11,7 +11,8 @@
 from dataclasses import dataclass
 from enum import IntEnum, unique
 
-import pkg_resources
+from importlib.resources import files
+
 from executorch.exir._serialize._dataclass import _DataclassEncoder
 from executorch.exir._serialize._flatbuffer import _flatc_compile
 from executorch.exir.backend.backend_details import CompileSpec
@@ -36,12 +37,15 @@ def gen_samsung_backend_compile_spec_core(options: EnnExecuTorchOptions) -> Comp
     with tempfile.TemporaryDirectory() as d:
         # schema
         schema_path = os.path.join(d, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME))
+
+        schema_content = (
+            files(__package__)
+            .joinpath(f"{COMPILE_OPTION_SCHEMA_NAME}.fbs")
+            .read_bytes()
+        )
+
         with open(schema_path, "wb") as schema_file:
-            schema_file.write(
-                pkg_resources.resource_string(
-                    __name__, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME)
-                )
-            )
+            schema_file.write(schema_content)
         # dump json
         json_path = os.path.join(d, "{}.json".format(COMPILE_OPTION_SCHEMA_NAME))
         enn_options_json = json.dumps(options, cls=_DataclassEncoder, indent=4)
diff --git a/backends/samsung/serialization/enn_graph_schema.py b/backends/samsung/serialization/enn_graph_schema.py
index 7e74182f9d7..5209a8672ee 100644
--- a/backends/samsung/serialization/enn_graph_schema.py
+++ b/backends/samsung/serialization/enn_graph_schema.py
@@ -5,13 +5,16 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import executorch.backends.samsung.python.PyGraphWrapperAdaptor as PyGraphWrapper
 
 import numpy as np
 
 import torch
+from executorch.backends.samsung.builders.utils import DATA_TYPE_STR_MAPPING
+from executorch.backends.samsung.utils.constants import QuantConstants
+from executorch.backends.samsung.utils.utils import quantize_tensor
 
 
 class EnnGraph:
@@ -24,6 +27,10 @@ def __init__(self):
         self.inputs = []
         self.outputs = []
 
+    def init(self, name: str, soc_name):
+        self.name = name
+        self.soc_name = soc_name
+
     def define_op(
         self,
         name,
@@ -46,22 +53,54 @@ def define_op(
                     py_param_wrapper.SetScalarValue(params[key])
                 else:
                     logging.error("Unsupported param type.")
+                # Set
                 op.AddOpParam(py_param_wrapper)
 
         self.graph.DefineOpNode(op)
 
-    def define_tensor(
+    def define_tensor(  # noqa: C901
         self,
         name: str,
         shape: List,
         data_type: str,
         tensor_type: str,
         data: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        quant_param: Optional[Dict[str, Any]] = None,
     ) -> int:
         layout = "NCHW" if len(shape) == 4 else "UNDEFINED"
 
+        if quant_param is not None:
+            data_type = DATA_TYPE_STR_MAPPING[
+                quant_param[QuantConstants.QUANT_KEY.quant_dtype]
+            ]
+
         tensor = PyGraphWrapper.PyEnnTensorWrapper(name, shape, data_type, layout)
 
+        if quant_param is not None:
+            need_quantize = True
+
+            scales = self._affine_meta_param(
+                quant_param[QuantConstants.QUANT_KEY.scale]
+            )
+            zero_points = self._affine_meta_param(
+                quant_param[QuantConstants.QUANT_KEY.zero_point]
+            )
+            q_dtype = self._affine_meta_param(
+                quant_param[QuantConstants.QUANT_KEY.quant_dtype]
+            )
+            tensor.AddQuantizeParam(q_dtype, scales, zero_points)
+
+            if need_quantize and data is not None:
+                if isinstance(data, np.ndarray):
+                    data = torch.tensor(data)
+                data = quantize_tensor(
+                    data,
+                    scales,
+                    zero_points,
+                    quant_param[QuantConstants.QUANT_KEY.quant_dtype],
+                    axis=quant_param.get("axis"),
+                )
+
         if data is not None:
             if isinstance(data, torch.Tensor):
                 data = data.detach().numpy()
@@ -83,3 +122,20 @@ def finish(self):
 
     def serialize(self):
         return self.graph.Serialize()
+
+    @staticmethod
+    def _affine_meta_param(param: Any) -> str:
+        type_str_affine_table = {
+            torch.int8: "AINT8",
+        }
+        if isinstance(param, str):
+            return param
+        if isinstance(param, (float, int)):
+            return [param]
+        if hasattr(param, "tolist"):
+            return param.tolist()
+        if isinstance(param, torch.dtype):
+            # Convenient for debugging
+            param = type_str_affine_table.get(param, "")
+
+        return param
diff --git a/backends/samsung/utils/constants.py b/backends/samsung/utils/constants.py
new file mode 100644
index 00000000000..7c3997b9fe2
--- /dev/null
+++ b/backends/samsung/utils/constants.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+class QuantConstants:
+    # TODO: check keys
+    class QUANT_KEY:
+        scale = "scales"
+        zero_point = "zero_points"
+        quant_min = "quant_min"
+        quant_max = "quant_max"
+        quant_dtype = "quant_dtype"
+
+    PERCHANNEL_KEY_MAP = {
+        "scales": QUANT_KEY.scale,
+        "zero_points": QUANT_KEY.zero_point,
+        "quant_min": QUANT_KEY.quant_min,
+        "quant_max": QUANT_KEY.quant_max,
+        "dtype": QUANT_KEY.quant_dtype,
+    }
+    # SNC ir always use key 'scales' and 'zero_points'
+    PERTENSOR_KEY_MAP = {
+        "scale": QUANT_KEY.scale,
+        "zero_point": QUANT_KEY.zero_point,
+        "quant_min": QUANT_KEY.quant_min,
+        "quant_max": QUANT_KEY.quant_max,
+        "dtype": QUANT_KEY.quant_dtype,
+    }
+
+    QUANT_OPS_KEY_MAP = {
+        exir_ops.edge.quantized_decomposed.quantize_per_channel.default: PERCHANNEL_KEY_MAP,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: PERTENSOR_KEY_MAP,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: PERTENSOR_KEY_MAP,
+    }
+
+    DEQUANT_OPS_KEY_MAP = {
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: PERTENSOR_KEY_MAP,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: PERTENSOR_KEY_MAP,
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: PERCHANNEL_KEY_MAP,
+    }
diff --git a/backends/samsung/utils/export_utils.py b/backends/samsung/utils/export_utils.py
index aaf407ef0b3..39992f2ea2a 100644
--- a/backends/samsung/utils/export_utils.py
+++ b/backends/samsung/utils/export_utils.py
@@ -4,20 +4,30 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Optional, Tuple
+import logging
+from typing import List, Optional, Tuple
 
 import executorch.exir as exir
 import torch
+from executorch.backends.samsung._passes.fuse_conv_act import FuseConvActPass
+from executorch.backends.samsung._passes.remove_useless_ops import RemoveUselessOpPass
 from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer.quantizer import EnnQuantizer, Precision
+from executorch.backends.transforms.decompose_sdpa import (
+    DecomposeScaledDotProductAttention,
+)
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
 from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.backend_details import CompileSpec
-
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_manager import PassType
 from executorch.exir.program._program import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 
 def get_edge_compile_config():
+    # Maybe most ops in non-decomposition list should be added here
+    # TODO: to confirm whether all op in none-decomposed table should be added here
     return EdgeCompileConfig(
         _skip_dim_order=True,
         _core_aten_ops_exception_list=[
@@ -29,24 +39,55 @@ def get_edge_compile_config():
             exir_ops.edge.aten._safe_softmax.default,
             exir_ops.edge.aten.layer_norm.default,
             exir_ops.edge.aten.matmul.default,
+            exir_ops.edge.aten.hardsigmoid.default,
         ],
     )
 
 
+def get_enn_pass_list() -> List[PassType]:
+    return [
+        RemoveUselessOpPass(),
+        RemoveCloneOpsTransform(),
+        FuseConvActPass(),
+    ]
+
+
+def quantize_module(
+    module: torch.nn.Module,
+    inputs,
+    calibration_dataset,
+    precision: Precision,
+    is_per_channel: bool = True,
+    is_qat: bool = False,
+) -> torch.nn.Module:
+    quantizer = EnnQuantizer()
+    quantizer.setup_quant_params(precision, is_per_channel, is_qat)
+    logging.info("Export nn module for quantization...")
+    exported_module = torch.export.export_for_training(module, inputs).module()
+    DecomposeScaledDotProductAttention()(exported_module)
+    logging.info("Quantizing the module...")
+    annotated_module = prepare_pt2e(exported_module, quantizer)
+    for data in calibration_dataset:
+        annotated_module(*data)
+    quantized_module = convert_pt2e(annotated_module, fold_quantize=False)
+    logging.info("Quantizing finished.")
+    return quantized_module
+
+
 def to_edge_transform_and_lower_to_enn(
     module: torch.nn.Module,
     inputs: Tuple[torch.Tensor],
+    custom_pass_config: List[PassType] = None,
     compile_specs: Optional[CompileSpec] = None,
 ) -> exir.ExecutorchProgramManager:
-    assert (
-        compile_specs is not None
-    ), "Please provide compile specifications for enn backend"
+    assert compile_specs is not None, "For now, we must deliver complile specs"
     prog = torch.export.export(module, inputs)
-
-    ahead_pass_list = [RemoveCloneOpsTransform()]
+    pass_list = get_enn_pass_list()
+    if custom_pass_config:
+        pass_list.extend(custom_pass_config)
     return to_edge_transform_and_lower(
         prog,
-        ahead_pass_list,
+        pass_list,
         {"forward": [EnnPartitioner(compile_specs)]},
         compile_config=get_edge_compile_config(),
     )
diff --git a/backends/samsung/utils/utils.py b/backends/samsung/utils/utils.py
index 5da9808f38f..bbbec518b2a 100644
--- a/backends/samsung/utils/utils.py
+++ b/backends/samsung/utils/utils.py
@@ -4,12 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import List
+from typing import List, Optional, Tuple
 
 import torch
 
 from executorch.backends.transforms.utils import is_param_node
 from executorch.exir.backend.backend_details import CompileSpec
+from executorch.exir.dialects._ops import ops as exir_ops
 
 from torch.export.exported_program import ExportedProgram
 
@@ -35,3 +36,90 @@ def is_graph_output(node: torch.fx.Node) -> bool:
         ):
             return True
     return False
+
+
+def _quantize_per_tensor(
+    in_tensor: torch.Tensor,
+    scales: List[float],
+    zeropoints: List[int],
+    dtype: torch.dtype,
+    qrange: Optional[Tuple[int, int]],
+):
+    assert (
+        len(scales) == 1
+    ), "For per-tensor quantization, there should be only one scale/zeropoint"
+    return exir_ops.edge.quantized_decomposed.quantize_per_tensor.default(
+        in_tensor,
+        torch.Tensor(scales),
+        torch.Tensor(zeropoints),
+        qrange[0],
+        qrange[1],
+        dtype,
+    )
+
+
+def _quantize_per_channel(
+    in_tensor: torch.Tensor,
+    scales: List[float],
+    zeropoints: List[int],
+    dtype: torch.dtype,
+    qrange: Optional[Tuple[int, int]],
+    axis: Optional[int],  # Only for per-channel
+):
+    assert (
+        len(scales) == in_tensor.shape[axis]
+    ), "Shape not match for quant params and input tensor"
+    return exir_ops.edge.quantized_decomposed.quantize_per_channel.default(
+        in_tensor,
+        torch.Tensor(scales),
+        torch.Tensor(zeropoints),
+        axis,
+        qrange[0],
+        qrange[1],
+        dtype,
+    )
+
+
+def quantize_tensor(
+    in_tensor: torch.Tensor,
+    scales: List[float],
+    zeropoints: List[int],
+    dtype: torch.dtype,
+    qrange: Optional[Tuple[int, int]] = None,
+    axis: Optional[int] = None,  # Only for per-channel
+) -> torch.Tensor:
+    """
+    To quantize constant tensor by executorch OPs. If `axis` not set, we quantize the tensor by per tensor.
+    If `axis` was set, we do per-channel quantize.
+
+    :param in_tensor: The tensor to be quantized
+    :param scales: List of scales. For per-tensor quantization, it should contain only one element
+    :param zeropoints: List of zeropoints. For per-tensor quantization, it should contain only one element
+    :param dtype: The output dtype
+    :param qrange: The quantization range (qmin, qmax).
+        If not set, we will get the maximum range of the dtype by `torch.iinfo`
+    :param axis: We do per-channel quantize by which axis.
+        Only when this parameter set, we do per-channel quantization
+    :type in_tensor: torch.Tensor
+    :type scalse: List[float]
+    :type zeropoints: List[int]
+    :type dtype: torch.dtype
+    :type qrange: Optional[Tuple[int,int]]
+    :type axis: Optional[int]
+    :return: The quantized tensor
+    """
+    assert len(scales) == len(
+        zeropoints
+    ), "scales should have same shape with zeropoints"
+    if not qrange:
+        qrange = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
+
+    if axis is not None:
+        return _quantize_per_channel(in_tensor, scales, zeropoints, dtype, qrange, axis)
+    return _quantize_per_tensor(
+        in_tensor,
+        scales,
+        zeropoints,
+        dtype,
+        qrange,
+    )
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
index 351bab4a605..02c6fc4c82d 100644
--- a/backends/test/harness/tester.py
+++ b/backends/test/harness/tester.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import random
 from collections import Counter, OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -62,6 +67,7 @@ def __init__(
             StageType.RUN_PASSES: [
                 StageType.PARTITION,
                 StageType.TO_EDGE_TRANSFORM_AND_LOWER,
+                StageType.TO_EXECUTORCH,
             ],
             # TODO Make this Stage optional
             StageType.PARTITION: [StageType.TO_EXECUTORCH],
diff --git a/backends/test/suite/README.md b/backends/test/suite/README.md
index 564f44362ad..901cd461dbe 100644
--- a/backends/test/suite/README.md
+++ b/backends/test/suite/README.md
@@ -5,37 +5,71 @@ This directory contains tests that validate correctness and coverage of backends
 These tests are intended to ensure that backends are robust and provide a smooth, "out-of-box" experience for users across the full span of input patterns. They are not intended to be a replacement for backend-specific tests, as they do not attempt to validate performance or that backends delegate operators that they expect to.
 
 ## Running Tests and Interpreting Output
-Tests can be run from the command line, either using the runner.py entry point or the standard Python unittest runner. When running through runner.py, the test runner will report test statistics, including the number of tests with each result type.
+Tests can be run from the command line using pytest. When generating a JSON test report, the runner will report detailed test statistics, including output accuracy, delegated nodes, lowering timing, and more.
 
-Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements.
+Each backend and test flow (recipe) registers a pytest [marker](https://docs.pytest.org/en/stable/example/markers.html) that can be passed to pytest with the `-m marker` argument to filter execution.
 
-Example:
+To run all XNNPACK backend operator tests:
 ```
-ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner
+pytest -c /dev/nul backends/test/suite/operators/ -m backend_xnnpack -n auto
 ```
 
+To run all model tests for the CoreML static int8 lowering flow:
+```
+pytest -c /dev/nul backends/test/suite/models/ -m flow_coreml_static_int8 -n auto
 ```
-2465 Passed / 2494
-16 Failed
-13 Skipped
 
-[Success]
-736 Delegated
-1729 Undelegated
+To run a specific test:
+```
+pytest -c /dev/nul backends/test/suite/ -k "test_prelu_f32_custom_init[xnnpack]"
+```
 
-[Failure]
-5 Lowering Fail
-3 PTE Run Fail
-8 Output Mismatch Fail
+To generate a JSON report:
+```
+pytest -c /dev/nul backends/test/suite/operators/ -n auto --json-report --json-report-file="test_report.json"
 ```
 
-Outcomes can be interpreted as follows:
- * Success (delegated): The test passed and at least one op was delegated by the backend.
- * Success (undelegated): The test passed with no ops delegated by the backend. This is a pass, as the partitioner works as intended.
- * Skipped: test fails in eager or export (indicative of a test or dynamo issue).
- * Lowering fail: The test fails in to_edge_transform_and_lower.
- * PTE run failure: The test errors out when loading or running the method.
- * Output mismatch failure: Output delta (vs eager) exceeds the configured tolerance.
+See [pytest-json-report](https://pypi.org/project/pytest-json-report/) for information on the report format. The test logic in this repository attaches additional metadata to each test entry under the `metadata`/`subtests` keys. One entry is created for each call to `test_runner.lower_and_run_model`.
+
+Here is a excerpt from a test run, showing a successful run of the `test_add_f32_bcast_first[xnnpack]` test.
+```json
+"tests": [
+    {
+      "nodeid": "operators/test_add.py::test_add_f32_bcast_first[xnnpack]",
+      "lineno": 38,
+      "outcome": "passed",
+      "keywords": [
+        "test_add_f32_bcast_first[xnnpack]",
+        "flow_xnnpack",
+        "backend_xnnpack",
+        ...
+      ],
+      "metadata": {
+        "subtests": [
+          {
+            "Test ID": "test_add_f32_bcast_first[xnnpack]",
+            "Test Case": "test_add_f32_bcast_first",
+            "Subtest": 0,
+            "Flow": "xnnpack",
+            "Result": "Pass",
+            "Result Detail": "",
+            "Error": "",
+            "Delegated": "True",
+            "Quantize Time (s)": null,
+            "Lower Time (s)": "2.881",
+            "Output 0 Error Max": "0.000",
+            "Output 0 Error MAE": "0.000",
+            "Output 0 SNR": "inf",
+            "Delegated Nodes": 1,
+            "Undelegated Nodes": 0,
+            "Delegated Ops": {
+              "aten::add.Tensor": 1
+            },
+            "PTE Size (Kb)": "1.600"
+          }
+        ]
+      }
+```
 
 ## Backend Registration
 
@@ -43,11 +77,11 @@ To plug into the test framework, each backend should provide an implementation o
 
 At a minimum, the backend will likely need to provide a custom implementation of the Partition and ToEdgeTransformAndLower stages using the appropriate backend partitioner. See backends/xnnpack/test/tester/tester.py for an example implementation.
 
-Once a tester is available, the backend flow(s) can be added in __init__.py in this directory by adding an entry to `ALL_TESTER_FLOWS`. Each flow entry consists of a name (used in the test case naming) and a function to instantiate a tester for a given model and input tuple.
+Once a tester is available, the backend flow(s) can be added under flows/ and registered in flow.py. It is intended that this will be unified with the lowering recipes under executorch/export in the near future.
 
 ## Test Cases
 
-Operator test cases are defined under the operators/ directory. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow. The `@operator_test` decorator is applied to each test class to trigger this behavior. Tests can also be tagged with an appropriate type specifier, such as `@dtype_test`, to generate variants for each dtype. The decorators and "magic" live in __init__.py in this directory.
+Operator test cases are defined under the operators/ directory. Model tests are under models/. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow by use of the `test_runner` fixture parameter. Tests can additionally be parameterized using standard pytest decorators. Parameterizing over dtype is a common use case.
 
 ## Evolution of this Test Suite
 
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
index 05fc760683d..29394951bd7 100644
--- a/backends/test/suite/flow.py
+++ b/backends/test/suite/flow.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import logging
 
 from dataclasses import dataclass, field
@@ -122,10 +127,18 @@ def all_flows() -> dict[str, TestFlow]:
         logger.info(f"Skipping QNN flow registration: {e}")
 
     try:
-        from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW
+        from executorch.backends.test.suite.flows.arm import (
+            ARM_ETHOS_U55_FLOW,
+            ARM_ETHOS_U85_FLOW,
+            ARM_TOSA_FP_FLOW,
+            ARM_TOSA_INT_FLOW,
+        )
 
         flows += [
-            ARM_TOSA_FLOW,
+            ARM_TOSA_FP_FLOW,
+            ARM_TOSA_INT_FLOW,
+            ARM_ETHOS_U55_FLOW,
+            ARM_ETHOS_U85_FLOW,
         ]
     except Exception as e:
         logger.info(f"Skipping ARM flow registration: {e}")
diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py
index baa2df79de9..85674331eda 100644
--- a/backends/test/suite/flows/arm.py
+++ b/backends/test/suite/flows/arm.py
@@ -1,24 +1,68 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Create flows for Arm Backends used to test operator and model suits
+
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.util._factory import create_quantizer
 from executorch.backends.test.suite.flow import TestFlow
+from executorch.backends.xnnpack.test.tester.tester import Quantize
 
 
-def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester:
-    kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP")
+def _create_arm_flow(
+    name,
+    compile_spec: ArmCompileSpec,
+    symmetric_io_quantization: bool = False,
+    per_channel_quantization: bool = True,
+) -> TestFlow:
 
-    return ArmTester(
-        *args,
-        **kwargs,
-    )
+    def _create_arm_tester(*args, **kwargs) -> ArmTester:
+        kwargs["compile_spec"] = compile_spec
+        return ArmTester(*args, **kwargs)
+
+    support_serialize = not isinstance(compile_spec, TosaCompileSpec)
+    quantize = compile_spec.tosa_spec.support_integer()
+
+    if quantize is True:
 
+        def create_quantize_stage() -> Quantize:
+            quantizer = create_quantizer(compile_spec)
+            quantization_config = get_symmetric_quantization_config(
+                is_per_channel=per_channel_quantization
+            )
+            if symmetric_io_quantization:
+                quantizer.set_io(quantization_config)
+            return Quantize(quantizer, quantization_config)
 
-def _create_tosa_flow() -> TestFlow:
     return TestFlow(
-        "arm_tosa",
+        name,
         backend="arm",
-        tester_factory=_create_arm_tester_tosa_fp,
-        supports_serialize=False,
+        tester_factory=_create_arm_tester,
+        supports_serialize=support_serialize,
+        quantize=quantize,
+        quantize_stage_factory=(create_quantize_stage if quantize is True else False),
     )
 
 
-ARM_TOSA_FLOW = _create_tosa_flow()
+ARM_TOSA_FP_FLOW = _create_arm_flow(
+    "arm_tosa_fp",
+    common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
+)
+ARM_TOSA_INT_FLOW = _create_arm_flow(
+    "arm_tosa_int",
+    common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"),
+)
+ARM_ETHOS_U55_FLOW = _create_arm_flow(
+    "arm_ethos_u55",
+    common.get_u55_compile_spec(),
+)
+ARM_ETHOS_U85_FLOW = _create_arm_flow(
+    "arm_ethos_u85",
+    common.get_u85_compile_spec(),
+)
diff --git a/backends/test/suite/flows/qualcomm.py b/backends/test/suite/flows/qualcomm.py
index 9998caa51b6..99deb3d4877 100644
--- a/backends/test/suite/flows/qualcomm.py
+++ b/backends/test/suite/flows/qualcomm.py
@@ -42,7 +42,7 @@ def create_quantize_stage() -> Quantize:
 
 QNN_TEST_FLOW = _create_qnn_flow("qnn")
 QNN_16A16W_TEST_FLOW = _create_qnn_flow(
-    "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_8a8w, use_fp16=False
+    "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_16a16w, use_fp16=False
 )
 QNN_16A8W_TEST_FLOW = _create_qnn_flow(
     "qnn_16a8w", quantize=True, quant_dtype=QuantDtype.use_16a8w, use_fp16=False
diff --git a/backends/test/suite/operators/__init__.py b/backends/test/suite/operators/__init__.py
index fa5ec2566d4..7475af29e15 100644
--- a/backends/test/suite/operators/__init__.py
+++ b/backends/test/suite/operators/__init__.py
@@ -70,7 +70,9 @@ def __init__(self, test_runner):
         self._test_runner = test_runner
 
     def _test_op(self, model, args, flow, generate_random_test_inputs=True):
-        self._test_runner.lower_and_run_model(model, args)
+        self._test_runner.lower_and_run_model(
+            model, args, generate_random_test_inputs=generate_random_test_inputs
+        )
 
 
 def wrap_test(original_func, test_type):
diff --git a/backends/test/suite/operators/test_lstm.py b/backends/test/suite/operators/test_lstm.py
index 91dd73c9052..11632e1e055 100644
--- a/backends/test/suite/operators/test_lstm.py
+++ b/backends/test/suite/operators/test_lstm.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -15,6 +16,11 @@
     operator_test,
     OperatorTest,
 )
+from torch.nn.quantizable.modules.rnn import LSTM as QuantizableLSTM
+
+
+def _get_lstm_cls(use_quantizable_lstm: bool):
+    return QuantizableLSTM if use_quantizable_lstm else torch.nn.LSTM
 
 
 class Model(torch.nn.Module):
@@ -27,9 +33,11 @@ def __init__(
         batch_first=True,
         dropout=0.0,
         bidirectional=False,
+        use_quantizable_lstm: bool = False,
     ):
         super().__init__()
-        self.lstm = torch.nn.LSTM(
+        lstm_cls = _get_lstm_cls(use_quantizable_lstm)
+        self.lstm = lstm_cls(
             input_size=input_size,
             hidden_size=hidden_size,
             num_layers=num_layers,
@@ -47,106 +55,133 @@ def forward(self, x):
 class LSTM(OperatorTest):
     @dtype_test
     def test_lstm_dtype(self, flow: TestFlow, dtype) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(num_layers=2).to(dtype),
+            Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm).to(dtype),
             ((torch.rand(1, 10, 64) * 10).to(dtype),),  # (batch=1, seq_len, input_size)
             flow,
         )
 
     @dtype_test
     def test_lstm_no_bias_dtype(self, flow: TestFlow, dtype) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(num_layers=2, bias=False).to(dtype),
+            Model(
+                num_layers=2, bias=False, use_quantizable_lstm=use_quantizable_lstm
+            ).to(dtype),
             ((torch.rand(1, 10, 64) * 10).to(dtype),),
             flow,
         )
 
     def test_lstm_feature_sizes(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(input_size=32, hidden_size=16),
+            Model(
+                input_size=32,
+                hidden_size=16,
+                use_quantizable_lstm=use_quantizable_lstm,
+            ),
             (torch.randn(1, 8, 32),),  # (batch=1, seq_len, input_size)
             flow,
         )
         self._test_op(
-            Model(input_size=128, hidden_size=64),
+            Model(
+                input_size=128,
+                hidden_size=64,
+                use_quantizable_lstm=use_quantizable_lstm,
+            ),
             (torch.randn(1, 12, 128),),
             flow,
         )
         self._test_op(
-            Model(input_size=256, hidden_size=128),
+            Model(
+                input_size=256,
+                hidden_size=128,
+                use_quantizable_lstm=use_quantizable_lstm,
+            ),
             (torch.randn(1, 6, 256),),
             flow,
         )
         self._test_op(
-            Model(input_size=16, hidden_size=32),
+            Model(
+                input_size=16,
+                hidden_size=32,
+                use_quantizable_lstm=use_quantizable_lstm,
+            ),
             (torch.randn(1, 5, 16),),
             flow,
         )
 
     def test_lstm_batch_sizes(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(8, 10, 64),),
             flow,
         )
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(32, 10, 64),),
             flow,
         )
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(100, 10, 64),),
             flow,
         )
 
     def test_lstm_seq_lengths(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 5, 64),),
             flow,
         )
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 20, 64),),
             flow,
         )
         self._test_op(
-            Model(),
+            Model(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 50, 64),),
             flow,
         )
 
     def test_lstm_batch_first_false(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(batch_first=False),
+            Model(batch_first=False, use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(10, 1, 64),),  # (seq_len, batch=1, input_size)
             flow,
         )
 
     def test_lstm_num_layers(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(num_layers=2),
+            Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 10, 64),),
             flow,
         )
         self._test_op(
-            Model(num_layers=3),
+            Model(num_layers=3, use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 10, 64),),
             flow,
         )
 
     def test_lstm_bidirectional(self, flow: TestFlow) -> None:
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(bidirectional=True),
+            Model(bidirectional=True, use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 10, 64),),
             flow,
         )
 
     def test_lstm_with_dropout(self, flow: TestFlow) -> None:
         # Note: Dropout is only effective with num_layers > 1
+        use_quantizable_lstm = flow.quantize
         self._test_op(
-            Model(num_layers=2, dropout=0.2),
+            Model(num_layers=2, dropout=0.2, use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(1, 10, 64),),
             flow,
         )
@@ -154,9 +189,10 @@ def test_lstm_with_dropout(self, flow: TestFlow) -> None:
     def test_lstm_with_initial_states(self, flow: TestFlow) -> None:
         # Create a model that accepts initial states
         class ModelWithStates(torch.nn.Module):
-            def __init__(self):
+            def __init__(self, use_quantizable_lstm: bool = False):
                 super().__init__()
-                self.lstm = torch.nn.LSTM(
+                lstm_cls = _get_lstm_cls(use_quantizable_lstm)
+                self.lstm = lstm_cls(
                     input_size=64,
                     hidden_size=32,
                     num_layers=2,
@@ -169,9 +205,10 @@ def forward(self, x, h0, c0):
         batch_size = 1
         num_layers = 2
         hidden_size = 32
+        use_quantizable_lstm = flow.quantize
 
         self._test_op(
-            ModelWithStates(),
+            ModelWithStates(use_quantizable_lstm=use_quantizable_lstm),
             (
                 torch.randn(batch_size, 10, 64),  # input
                 torch.randn(num_layers, batch_size, hidden_size),  # h0
@@ -183,9 +220,10 @@ def forward(self, x, h0, c0):
     def test_lstm_return_hidden_states(self, flow: TestFlow) -> None:
         # Create a model that returns both output and hidden states
         class ModelWithHiddenStates(torch.nn.Module):
-            def __init__(self):
+            def __init__(self, use_quantizable_lstm: bool = False):
                 super().__init__()
-                self.lstm = torch.nn.LSTM(
+                lstm_cls = _get_lstm_cls(use_quantizable_lstm)
+                self.lstm = lstm_cls(
                     input_size=64,
                     hidden_size=32,
                     num_layers=2,
@@ -200,9 +238,10 @@ def forward(self, x):
         batch_size = 1
         seq_len = 10
         input_size = 64
+        use_quantizable_lstm = flow.quantize
 
         self._test_op(
-            ModelWithHiddenStates(),
+            ModelWithHiddenStates(use_quantizable_lstm=use_quantizable_lstm),
             (torch.randn(batch_size, seq_len, input_size),),
             flow,
         )
diff --git a/backends/test/suite/operators/test_rsqrt.py b/backends/test/suite/operators/test_rsqrt.py
index 705833194fb..bb51b213dd4 100644
--- a/backends/test/suite/operators/test_rsqrt.py
+++ b/backends/test/suite/operators/test_rsqrt.py
@@ -37,15 +37,28 @@ def test_rsqrt_dtype(self, flow: TestFlow, dtype) -> None:
 
     def test_rsqrt_shapes(self, flow: TestFlow) -> None:
         # Test with different tensor shapes
-
         # 1D tensor
-        self._test_op(RsqrtModel(), (torch.rand(20) + 0.01,), flow)
-
+        self._test_op(
+            RsqrtModel(),
+            (torch.rand(20) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
         # 2D tensor
-        self._test_op(RsqrtModel(), (torch.rand(5, 10) + 0.01,), flow)
+        self._test_op(
+            RsqrtModel(),
+            (torch.rand(5, 10) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
         # 3D tensor
-        self._test_op(RsqrtModel(), (torch.rand(3, 4, 5) + 0.01,), flow)
+        self._test_op(
+            RsqrtModel(),
+            (torch.rand(3, 4, 5) + 0.01,),
+            flow,
+            generate_random_test_inputs=False,
+        )
 
     @unittest.skip("NaN and Inf are not enforced for backends.")
     def test_rsqrt_edge_cases(self, flow: TestFlow) -> None:
diff --git a/backends/test/suite/operators/test_sqrt.py b/backends/test/suite/operators/test_sqrt.py
index 3d327ade6a5..92fbc64878e 100644
--- a/backends/test/suite/operators/test_sqrt.py
+++ b/backends/test/suite/operators/test_sqrt.py
@@ -39,13 +39,19 @@ def test_sqrt_shapes(self, flow: TestFlow) -> None:
         # Test with different tensor shapes
 
         # 1D tensor
-        self._test_op(SqrtModel(), (torch.rand(20),), flow)
+        self._test_op(
+            SqrtModel(), (torch.rand(20),), flow, generate_random_test_inputs=False
+        )
 
         # 2D tensor
-        self._test_op(SqrtModel(), (torch.rand(5, 10),), flow)
+        self._test_op(
+            SqrtModel(), (torch.rand(5, 10),), flow, generate_random_test_inputs=False
+        )
 
         # 3D tensor
-        self._test_op(SqrtModel(), (torch.rand(3, 4, 5),), flow)
+        self._test_op(
+            SqrtModel(), (torch.rand(3, 4, 5),), flow, generate_random_test_inputs=False
+        )
 
     @unittest.skip("NaN and Inf are not enforced for backends.")
     def test_sqrt_edge_cases(self, flow: TestFlow) -> None:
diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt
index 17b2be4e73c..d9acde79ecf 100644
--- a/backends/vulkan/CMakeLists.txt
+++ b/backends/vulkan/CMakeLists.txt
@@ -111,6 +111,9 @@ file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp)
 
 # vulkan_backend
 
+# Try to find boost to log stack traces when throwing exceptions
+find_package(Boost 1.89 COMPONENTS stacktrace_basic stacktrace_addr2line)
+
 file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp)
 list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp})
 list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp})
@@ -121,6 +124,14 @@ target_include_directories(
   vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES}
 )
 target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core)
+# Optionally link boost for stacktraces if boost is available
+if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY)
+  target_link_libraries(
+    vulkan_backend PRIVATE ${Boost_STACKTRACE_LIBRARY}
+                           ${Boost_STACKTRACE_ADDR2LINE_LIBRARY}
+  )
+  list(APPEND VULKAN_CXX_FLAGS "-DETVK_BOOST_STACKTRACE_AVAILABLE")
+endif()
 target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS})
 # Link this library with --whole-archive due to dynamic backend registration
 executorch_target_link_options_shared_lib(vulkan_backend)
@@ -132,7 +143,7 @@ set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17)
 install(
   TARGETS vulkan_backend vulkan_schema
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${COMMON_INCLUDES}
 )
diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py
index 3beccc2205c..a6a5e751c05 100644
--- a/backends/vulkan/_passes/fold_qdq.py
+++ b/backends/vulkan/_passes/fold_qdq.py
@@ -17,9 +17,8 @@ class FoldQDQPass(ExportPass):
     valid quant op patterns have already been fused before this pass.
     """
 
-    def __init__(self, edge_program: torch.export.ExportedProgram):
-        super(FoldQDQPass, self).__init__()
-        self.edge_program = edge_program
+    def __init__(self):
+        super().__init__()
 
     def call(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py
index 6ced1f32a7c..1575dd6a4f6 100644
--- a/backends/vulkan/_passes/fuse_patterns.py
+++ b/backends/vulkan/_passes/fuse_patterns.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Optional
+
 import executorch.backends.vulkan.patterns as vk_patterns
 
 import torch
@@ -13,13 +15,15 @@
 
 
 class FusePatternsPass(ExportPass):
-    def __init__(self, exported_program: ExportedProgram) -> None:
+    def __init__(self) -> None:
         super().__init__()
-        self.program = exported_program
+        self._exported_program: Optional[ExportedProgram] = None
 
     def call(self, graph_module: torch.fx.GraphModule):
+        assert self._exported_program is not None
+
         total_replaced = vk_patterns.replace_all_fusable_subgraphs(
-            self.program, graph_module
+            self._exported_program, graph_module
         )
 
         if total_replaced > 0:
diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py
index ca9f7541159..bb8cf5f2e64 100644
--- a/backends/vulkan/_passes/fuse_quantized_ops.py
+++ b/backends/vulkan/_passes/fuse_quantized_ops.py
@@ -211,18 +211,20 @@ def fuse_into_linear_qcnw_node(
 
 
 class FuseQuantizedOpsTransform(ExportPass):
-    def __init__(self, exported_program: ExportedProgram) -> None:
+    def __init__(self) -> None:
         super().__init__()
-        self.program = exported_program
+        self._exported_program: Optional[ExportedProgram] = None
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        assert self._exported_program is not None
+
         for node in graph_module.graph.nodes:
             # Check for linear_qcnw pattern (weight-only quantization)
-            qcnw_details = matches_linear_qcnw_pattern(self.program, node)
+            qcnw_details = matches_linear_qcnw_pattern(self._exported_program, node)
             if qcnw_details is not None:
                 qcnw_method, qcnw_nbits = qcnw_details
                 fuse_into_linear_qcnw_node(
-                    self.program, graph_module, node, qcnw_method, qcnw_nbits
+                    self._exported_program, graph_module, node, qcnw_method, qcnw_nbits
                 )
                 continue
 
diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py
index db53cc666a8..8ed71aa1dae 100644
--- a/backends/vulkan/_passes/tag_memory_meta_pass.py
+++ b/backends/vulkan/_passes/tag_memory_meta_pass.py
@@ -230,6 +230,10 @@ def get_arg_tensor_source_repset(
         """
         arg_node = op_node.args[arg_i]
 
+        # For non-tensor arguments, return ANY_STORAGE
+        if not utils.is_tensor_arg_node(arg_node):
+            return utils.ANY_STORAGE
+
         # Special case for cat - use the first tensor in the list as representative
         if isinstance(arg_node, list):
             arg_node = arg_node[0]
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index a92b3b11f6f..63b57a0e79c 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -16,8 +16,6 @@
 
 import torch
 
-from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout
-
 from executorch.exir.dialects._ops import ops as exir_ops
 
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
@@ -48,6 +46,9 @@ class OpFeatures:
         # Optional check function used during partitioning to determine if a node's
         # inputs are supported by the operator implementation.
         "are_node_inputs_supported_fn",
+        # Optional function to determine valid representation sets for input and outputs
+        # once a node's actual inputs are known.
+        "pick_io_storage_fn",
     ]
 
     def __init__(
@@ -61,6 +62,7 @@ def __init__(
         supports_resize: bool = False,
         supports_prepacking: bool = False,
         are_node_inputs_supported_fn: Optional[Callable] = allow_node,
+        pick_io_storage_fn: Optional[Callable] = None,
     ):
         self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList(
             inputs_storage if inputs_storage is not None else []
@@ -77,15 +79,21 @@ def __init__(
         self.supports_prepacking = supports_prepacking
 
         self.are_node_inputs_supported_fn = are_node_inputs_supported_fn
+        self.pick_io_storage_fn = pick_io_storage_fn
 
     def make_op_repsets(
         self,
         op_node: torch.fx.Node,
         texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS,
     ) -> utils.OpRepSets:
-        return utils.OpRepSets(
-            self.inputs_storage, self.outputs_storage, op_node, texture_limits
-        )
+        inputs_storage = self.inputs_storage
+        outputs_storage = self.outputs_storage
+        if self.pick_io_storage_fn is not None:
+            i_storage, o_storage = self.pick_io_storage_fn(op_node)
+            inputs_storage = utils.TensorRepSetList(i_storage)
+            outputs_storage = utils.TensorRepSetList(o_storage)
+
+        return utils.OpRepSets(inputs_storage, outputs_storage, op_node, texture_limits)
 
 
 #######################
@@ -410,28 +418,16 @@ def register_softmax_op():
 )
 def register_reduce_op():
     def check_reduce_node(node: torch.fx.Node) -> bool:
+        # Only one argument implies that the reduction is over the entire tensor, which
+        # is not supported yet.
+        if len(node.args) == 1:
+            return False
+
         dim_list = node.args[1]
+        # Only 1D and 2D reductions are supported at the moment.
         if isinstance(dim_list, list) and len(dim_list) > 2:
             return False
 
-        if isinstance(dim_list, list) and len(dim_list) == 2:
-            # Try to get the memory layout for this node
-            try:
-                memory_layout = utils.get_node_memory_layout(node)
-
-                # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension
-                if (
-                    memory_layout is not None
-                    and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT
-                ):
-                    # For now only default layout is supported for 2D reduction.
-                    # Because we can't determine if the input is NCHW or NHWC here,
-                    # assume the reduction dimension is packed so we cannot support it.
-                    return False
-            except (AssertionError, KeyError, AttributeError):
-                # If we can't get memory layout information, we'll assume the dims aren't packed
-                pass
-
         def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
             for arg in node.args:
                 if isinstance(arg, bool):
@@ -446,10 +442,41 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool:
 
         return True
 
+    def pick_io_storage_for_reduce(node: torch.fx.Node):
+        inputs_storage = utils.ANY_TEXTURE
+        outputs_storage = utils.ANY_TEXTURE
+
+        input_tensor = node.args[0]
+        ndim = input_tensor.meta["val"].ndim
+        dim_list = node.args[1]
+        if isinstance(dim_list, list) and len(dim_list) == 2:
+            reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim)
+            reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim)
+
+            possible_packed_dims = {0, 1, 2}
+            possible_packed_dims.discard(reduce_dim1_whcn)
+            possible_packed_dims.discard(reduce_dim2_whcn)
+
+            packed_dim = possible_packed_dims.pop()
+            assert packed_dim in [0, 1, 2]
+
+            if packed_dim == 0:
+                inputs_storage = utils.WIDTH_PACKED_TEXTURE
+                outputs_storage = utils.WIDTH_PACKED_TEXTURE
+            elif packed_dim == 1:
+                inputs_storage = utils.HEIGHT_PACKED_TEXTURE
+                outputs_storage = utils.HEIGHT_PACKED_TEXTURE
+            else:
+                inputs_storage = utils.CHANNELS_PACKED_TEXTURE
+                outputs_storage = utils.CHANNELS_PACKED_TEXTURE
+
+        return inputs_storage, outputs_storage
+
     return OpFeatures(
         inputs_storage=utils.ANY_TEXTURE,
         supports_resize=True,
         are_node_inputs_supported_fn=check_reduce_node,
+        pick_io_storage_fn=pick_io_storage_for_reduce,
     )
 
 
@@ -474,6 +501,23 @@ def register_2d_pool_op():
     ]
 )
 def register_convolution_op():
+    def check_conv_node(node: torch.fx.Node) -> bool:
+        x = node.args[0]
+        x_shape = x.meta["val"].size()
+        # 4-D input implies 2D convolution
+        if len(x_shape) == 4:
+            batches = x.meta["val"].size()[0]
+            if batches != 1:
+                return False
+        # 3-D input implies 1D convolution
+        if len(x_shape) == 3:
+            transpose = node.args[6]
+            # Transposed 1D convolution is not supported yet
+            if transpose:
+                return False
+
+        return True
+
     return OpFeatures(
         inputs_storage=[
             utils.CHANNELS_PACKED_TEXTURE,  # input
@@ -490,6 +534,7 @@ def register_convolution_op():
         ],
         supports_resize=True,
         supports_prepacking=True,
+        are_node_inputs_supported_fn=check_conv_node,
     )
 
 
@@ -716,6 +761,7 @@ def register_ported_ops_with_prepacking():
     return OpFeatures(
         inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
         supports_prepacking=True,
+        supports_resize=True,
     )
 
 
@@ -746,6 +792,7 @@ def register_ported_ops_with_prepacking_all_dims():
     return OpFeatures(
         inputs_storage=utils.ANY_TEXTURE,
         supports_prepacking=True,
+        supports_resize=True,
     )
 
 
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index e5b2d0f7864..0bdc16616ef 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -36,7 +36,7 @@
     Partitioner,
     PartitionResult,
 )
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
 from executorch.exir.dialects._ops import ops as exir_ops
 
 from torch.export.exported_program import ExportedProgram
@@ -254,9 +254,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool:  # noqa: C901
             self.log_skip(node, "permute node of non compatible linear node")
             return False
 
-        is_in_local_scalar_dense_chain, dst_node_is_compatible = (
-            self.is_in_local_scalar_dense_chain(node)
-        )
+        (
+            is_in_local_scalar_dense_chain,
+            dst_node_is_compatible,
+        ) = self.is_in_local_scalar_dense_chain(node)
         if is_in_local_scalar_dense_chain and dst_node_is_compatible:
             return True
         elif is_in_local_scalar_dense_chain:
@@ -419,6 +420,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
             logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.")
 
         tag_constant_data(exported_program)
+        tag_mutated_buffer(exported_program)
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py
index 882d0d41e6d..374e29c634d 100644
--- a/backends/vulkan/patterns/quantized_linear.py
+++ b/backends/vulkan/patterns/quantized_linear.py
@@ -92,9 +92,11 @@ def __init__(self, mm_node: torch.fx.Node) -> None:
             return
 
         # Identify input node
-        self.fp_input_node, self.quantize_input_node, dq_node = (
-            utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
-        )
+        (
+            self.fp_input_node,
+            self.quantize_input_node,
+            dq_node,
+        ) = utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0])
         assert self.fp_input_node is not None
         self.all_nodes.append(self.fp_input_node)
 
@@ -386,7 +388,7 @@ def make_linear_dq8ca_q4gsw_op(
         weight_sums_node = create_constant_placeholder(
             exp_program=ep,
             graph=graph_module.graph,
-            kind=InputKind.CONSTANT_TENSOR,
+            kind=InputKind.PARAMETER,
             name=sums_name,
             data=sum_per_quant_group,
         )
@@ -429,7 +431,7 @@ def make_linear_q8ta_q8csw_custom_op(
         weight_sums_node = create_constant_placeholder(
             exp_program=ep,
             graph=graph_module.graph,
-            kind=InputKind.CONSTANT_TENSOR,
+            kind=InputKind.PARAMETER,
             name=sums_name,
             data=sum_per_output_channel,
         )
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
index 0f5dbc41273..88746c5594e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -60,7 +60,7 @@ void main() {
     int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y;
     start.y = ipos.y + num_steps * dilation.y;
   }
-  const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy));
+  const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
   // Compute the start of the kernel based on how far we are skipping ahead when
   // reading the input. Note that these are "canonical" indices.
   ivec2 kstart = (start - ipos) / dilation;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 02fbef29b75..9089f87d658 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -54,7 +54,7 @@ void main() {
   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so reads from the padding region are skipped.
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
+  const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy);
 
   VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 19250419baf..7448b042cad 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -97,6 +97,10 @@ void main() {
   for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) {
     for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) {
       in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      // Set to zero if reading out of bounds
+      if (any(greaterThanEqual(ivec2(x, y), in_sizes.xy))) {
+        in_texels[j] = VEC4_T(0);
+      }
     }
 
     // from 2nd iteration onwards accumulate dot product in 2nd sum
diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
index eff78a7938d..1a5b0cb235e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml
@@ -14,5 +14,6 @@ full:
     DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: int32
   shader_variants:
     - NAME: full
diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
index d35492bc367..86a2229c416 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl
@@ -42,7 +42,8 @@ layout(constant_id = 5) const int group_dim = 1;
 // work group will write into its assigned element in the shared array.
 #define MAX_NTHREADS 16
 
-shared vec4 shared_vecs[MAX_NTHREADS];
+shared vec4 shared_max[MAX_NTHREADS];
+shared vec4 shared_sum[MAX_NTHREADS];
 
 #include "indexing_utils.h"
 
@@ -102,13 +103,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     max_elements = max(max_elements, load_texel(tin, scan_pos));
   }
-  shared_vecs[smi] = max_elements;
+  shared_max[smi] = max_elements;
   barrier();
   // Iterate over the partial maximums to obtain the overall maximum
   group_i = tid.y * NWORKERS;
-  max_elements = shared_vecs[group_i++];
+  max_elements = shared_max[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
-    max_elements = max(max_elements, shared_vecs[group_i]);
+    max_elements = max(max_elements, shared_max[group_i]);
   }
 
   scan_pos[reduce_dim] = tid.x;
@@ -118,13 +119,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
        i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
     denominators += exp(load_texel(tin, scan_pos) - max_elements);
   }
-  shared_vecs[smi] = denominators;
+  shared_sum[smi] = denominators;
   barrier();
   // Iterate over the partial sums to obtain the overall sum
   group_i = tid.y * NWORKERS;
-  denominators = shared_vecs[group_i++];
+  denominators = shared_sum[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
-    denominators += shared_vecs[group_i];
+    denominators += shared_sum[group_i];
   }
 
   // Determine if there are any padding elements in the final texel of the
@@ -184,13 +185,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
       max_elements.x = max(intex[i], max_elements.x);
     }
   }
-  shared_vecs[smi] = max_elements;
+  shared_max[smi] = max_elements;
   barrier();
   // Iterate over the partial maximums to obtain the overall maximum
   group_i = tid.y * NWORKERS;
-  max_elements = shared_vecs[group_i++];
+  max_elements = shared_max[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
-    max_elements = max(max_elements, shared_vecs[group_i]);
+    max_elements = max(max_elements, shared_max[group_i]);
   }
   // Each element of the texel is itself a partial maximum; iterate over the
   // texel to find the actual maximum
@@ -214,13 +215,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) {
       denominators.x += exp(intex[i] - max_element);
     }
   }
-  shared_vecs[smi] = denominators;
+  shared_sum[smi] = denominators;
   barrier();
   // Iterate over the partial sums to obtain the overall sum
   group_i = tid.y * NWORKERS;
-  denominators = shared_vecs[group_i++];
+  denominators = shared_sum[group_i++];
   for (int i = 1; i < NWORKERS; ++i, group_i++) {
-    denominators += shared_vecs[group_i];
+    denominators += shared_sum[group_i];
   }
   // Reduce over the accumulated texel to find the overall sum
   float denominator = 0;
diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
index 757afd06849..a6dd8f07f53 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -19,6 +19,18 @@
 
 namespace vkcompute {
 
+void resize_batch_norm_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef self = args.at(1).refs.at(0);
+
+  // For batch norm, output dimensions are the same as input dimensions
+  std::vector<int64_t> new_out_sizes = graph->sizes_of(self);
+  graph->virtual_resize(out, new_out_sizes);
+}
+
 ValueRef check_and_prepack_arg(
     ComputeGraph& graph,
     ValueRef arg_ref,
@@ -101,7 +113,7 @@ void add_native_batch_norm_node(
       // Resize Args
       {},
       // Resizing Logic
-      nullptr));
+      resize_batch_norm_node));
 }
 
 void native_batch_norm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index 9ac4c963bc3..329620e80e6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -109,11 +109,15 @@ void add_permute_node(
   {
     IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
     const int32_t permute_ndim =
-        utils::safe_downcast<int>(permute_dims_ptr->size());
+        utils::safe_downcast<int32_t>(permute_dims_ptr->size());
 
     for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0;
          nchw_i--, whcn_i++) {
-      const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i);
+      int32_t permute_dim_nchw =
+          utils::safe_downcast<int32_t>(permute_dims_ptr->at(nchw_i));
+      if (permute_dim_nchw < 0) {
+        permute_dim_nchw += permute_ndim;
+      }
       const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw;
 
       whcn_permute_dims[whcn_i] = permute_dim_whcn;
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index 250fcdd5490..879f59667d6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -137,7 +137,7 @@ void max_pool2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
 
 struct DivisorParams final {
   int32_t divisor_override;
-  bool count_include_pad;
+  int32_t count_include_pad;
 };
 
 DivisorParams create_divisor_params(
@@ -148,7 +148,7 @@ DivisorParams create_divisor_params(
       graph.val_is_int(divisor_override)
           ? static_cast<int32_t>(graph.get_int(divisor_override))
           : 0,
-      graph.get_bool(count_include_pad)};
+      int32_t(graph.get_bool(count_include_pad))};
 }
 
 void add_avg_pool2d_node(
diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
index 13801b45cc7..e2b73b2f3f2 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp
@@ -32,8 +32,13 @@ void add_squeeze_copy_dims_node(
   // 2. Squeeze outter most dim
   // For these cases, just pass input to output via clone.
   for (int i = 0; i < dims.size(); ++i) {
-    if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) {
-      squeeze_dims.push_back(dims.at(i));
+    // adjust negative dims
+    int64_t dim_val = dims.at(i);
+    if (dim_val < 0) {
+      dim_val += in_dim;
+    }
+    if (dims.at(i) != 0 && in_sizes.at(dim_val) == 1) {
+      squeeze_dims.push_back(dim_val);
     }
   }
   if (squeeze_dims.size() == 0) {
diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp
index d3efa81e52a..5bcf047aaf1 100644
--- a/backends/vulkan/runtime/vk_api/Exception.cpp
+++ b/backends/vulkan/runtime/vk_api/Exception.cpp
@@ -10,6 +10,13 @@
 
 #include <sstream>
 
+#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif // _GNU_SOURCE
+#include <boost/stacktrace.hpp>
+#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
+
 namespace vkcompute {
 namespace vkapi {
 
@@ -65,6 +72,11 @@ Error::Error(SourceLocation source_location, std::string msg)
   std::ostringstream oss;
   oss << "Exception raised from " << source_location_ << ": ";
   oss << msg_;
+#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
+  oss << "\n";
+  oss << "Stack trace:\n";
+  oss << boost::stacktrace::stacktrace();
+#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
   what_ = oss.str();
 }
 
@@ -74,6 +86,11 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
   oss << "Exception raised from " << source_location_ << ": ";
   oss << "(" << cond << ") is false! ";
   oss << msg_;
+#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE
+  oss << "\n";
+  oss << "Stack trace:\n";
+  oss << boost::stacktrace::stacktrace();
+#endif // ETVK_BOOST_STACKTRACE_AVAILABLE
   what_ = oss.str();
 }
 
diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl
index a9ba62b6f9f..c48ce0a452b 100644
--- a/backends/vulkan/targets.bzl
+++ b/backends/vulkan/targets.bzl
@@ -19,6 +19,8 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
     default_flags = []
     android_flags = []
 
+    debug_mode = read_config("etvk", "debug", "0") == "1"
+
     if not no_volk:
         for flags in [default_flags, android_flags]:
             flags.append("-DUSE_VULKAN_WRAPPER")
@@ -32,6 +34,10 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
         if link_moltenvk:
             mac_flags = []
 
+        if debug_mode:
+            mac_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE")
+            default_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE")
+
         VK_API_PREPROCESSOR_FLAGS += select({
             "DEFAULT": default_flags,
             "ovr_config//os:android": android_flags,
@@ -59,7 +65,6 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode):
         if etvk_default_cache_path != "":
             VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)]
 
-        debug_mode = read_config("etvk", "debug", "0") == "1"
         if debug_mode:
             VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"]
 
@@ -136,6 +141,8 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal
     )
 
 def define_common_targets(is_fbcode = False):
+    debug_mode = read_config("etvk", "debug", "0") == "1"
+
     runtime.python_library(
         name = "gen_vulkan_spv_lib",
         srcs = [
@@ -200,6 +207,10 @@ def define_common_targets(is_fbcode = False):
                     "//third-party/khronos:moltenVK_static"
                 ]
 
+            if debug_mode:
+                mac_deps.append("fbsource//third-party/boost:boost")
+                default_deps.append("fbsource//third-party/boost:boost")
+
             VK_API_DEPS += select({
                 "DEFAULT": default_deps,
                 "ovr_config//os:android": android_deps,
diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS
index 53fad86f90c..ee296a4f68f 100644
--- a/backends/vulkan/test/TARGETS
+++ b/backends/vulkan/test/TARGETS
@@ -34,7 +34,6 @@ python_unittest(
     deps = [
         "//caffe2:torch",
         "//executorch/backends/vulkan/_passes:vulkan_passes",
-        "//executorch/backends/vulkan/quantizer:vulkan_quantizer",
         "//executorch/backends/vulkan:vulkan_preprocess",
         "//pytorch/ao:torchao",  # @manual
     ]
diff --git a/backends/vulkan/test/scripts/test_model.sh b/backends/vulkan/test/scripts/test_model.sh
index 5f06d2c039b..40ec88bae70 100755
--- a/backends/vulkan/test/scripts/test_model.sh
+++ b/backends/vulkan/test/scripts/test_model.sh
@@ -111,6 +111,7 @@ build_core_libraries_and_devtools() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
diff --git a/backends/vulkan/test/scripts/test_op.sh b/backends/vulkan/test/scripts/test_op.sh
index 1ec07b7f75f..797089e54dc 100755
--- a/backends/vulkan/test/scripts/test_op.sh
+++ b/backends/vulkan/test/scripts/test_op.sh
@@ -138,6 +138,7 @@ build_core_libraries() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py
index 4a30ab6c2de..438126a179f 100644
--- a/backends/vulkan/test/test_vulkan_passes.py
+++ b/backends/vulkan/test/test_vulkan_passes.py
@@ -3,15 +3,8 @@
 
 import torch
 
-from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform
-from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform
 from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass
 
-from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
-    get_symmetric_quantization_config,
-    VulkanQuantizer,
-)
-
 from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
 
 from executorch.exir.backend.canonical_partitioners.config_partitioner import (
@@ -94,66 +87,6 @@ def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) ->
 
 
 class TestVulkanPasses(unittest.TestCase):
-    def test_fuse_int8pack_mm(self):
-        K = 256
-        N = 256
-        model = SingleLinearModule(K, N)
-        sample_inputs = model.get_sample_inputs()
-
-        quantizer = VulkanQuantizer()
-        quantizer.set_global(
-            get_symmetric_quantization_config(is_dynamic=False, weight_bits=8)
-        )
-
-        edge_manager = quantize_and_lower_module(
-            model,
-            sample_inputs,
-            quantizer,
-        )
-
-        ep = edge_manager._edge_programs["forward"]
-        edge_manager.transform(
-            [
-                AddmmToLinearTransform(),
-                FuseQuantizedOpsTransform(ep),
-            ]
-        )
-
-        gm = ep.graph_module
-
-        self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1)
-        self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
-    def test_fuse_linear_qcs4w(self):
-        K = 256
-        N = 256
-        model = SingleLinearModule(K, N)
-        sample_inputs = model.get_sample_inputs()
-
-        quantizer = VulkanQuantizer()
-        quantizer.set_global(
-            get_symmetric_quantization_config(is_dynamic=False, weight_bits=4)
-        )
-
-        edge_manager = quantize_and_lower_module(
-            model,
-            sample_inputs,
-            quantizer,
-        )
-
-        ep = edge_manager._edge_programs["forward"]
-        edge_manager.transform(
-            [
-                AddmmToLinearTransform(),
-                FuseQuantizedOpsTransform(ep),
-            ]
-        )
-
-        gm = ep.graph_module
-
-        self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1)
-        self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0)
-
     def test_fuse_rotary_emb(self):
         """Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op."""
 
@@ -238,7 +171,8 @@ def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor):
 
         # Apply the rotary embedding pass
         ep = edge_manager._edge_programs["forward"]
-        rotary_pass = FusePatternsPass(ep)
+        rotary_pass = FusePatternsPass()
+        rotary_pass._exported_program = ep
         result = rotary_pass.call(ep.graph_module)
 
         # Verify that the pass was successful
diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py
index bfe4e9fceee..a887c53473a 100644
--- a/backends/vulkan/test/utils.py
+++ b/backends/vulkan/test/utils.py
@@ -90,7 +90,9 @@ def export_model_to_vulkan(
     qmode=QuantizationMode.NONE,
 ):
     compile_options = {}
-    exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode)
+    exported_graph = get_exported_graph(
+        model, sample_inputs, dynamic_shapes=dynamic_shapes, qmode=qmode
+    )
     program = export(
         exported_graph,
         sample_inputs,
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 972a4f26c1b..09c57f649ae 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -128,7 +128,7 @@ def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool:
         is_get_attr_node(node)
         or is_param(program, node)
         or is_buffer(program, node)
-        or is_constant(program, node)
+        or is_lifted_tensor_constant(program, node)
     )
 
 
@@ -206,6 +206,8 @@ def is_tensor_arg_node(node: Any) -> bool:
     if isinstance(node, torch.fx.Node):
         return is_tensor_node(node)
     elif isinstance(node, (list, tuple)):
+        if len(node) == 0:
+            return False
         return all(is_tensor_node(n) for n in node)
 
     return False
@@ -1228,6 +1230,16 @@ def is_in_8bit_range(tensor: torch.Tensor) -> bool:
 ##
 
 
+def nchw_dim_to_whcn_dim(nchw_dim: int, ndim: int) -> int:
+    # Handle negative indices for nchw_dim
+    if nchw_dim < 0:
+        nchw_dim += ndim
+
+    assert nchw_dim >= 0 and nchw_dim < ndim
+    whcn_dim = (ndim - 1) - nchw_dim
+    return whcn_dim
+
+
 def get_tensor_val_str(tensor_val: FakeTensor) -> str:
     return f"{tensor_val.dtype}: {tensor_val.shape}"
 
@@ -1279,6 +1291,7 @@ def update_program_state_dict(
     updated_tensor: torch.Tensor,
 ) -> None:
     target_name = None
+    kind = None
     # Iterate over all the tensors in the graph signature, and find
     # the one corresponding to the parameter/buffer name
     for input_ in program.graph_signature.input_specs:
@@ -1287,6 +1300,7 @@ def update_program_state_dict(
             and isinstance(input_.arg, TensorArgument)
             and input_.arg.name == buffer_name
         ):
+            kind = input_.kind
             target_name = input_.target
             break
 
@@ -1296,6 +1310,9 @@ def update_program_state_dict(
     ), f"could not find {buffer_name} in source program signature"
     assert target_name in program.state_dict, f"could not find {target_name}"
 
+    if kind == InputKind.PARAMETER:
+        updated_tensor = torch.nn.Parameter(updated_tensor, requires_grad=False)
+
     # Finally, overwrite the current tensor with updated tensor
     program.state_dict[target_name] = updated_tensor
 
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
index 2f91d97ff58..876f7fa8900 100644
--- a/backends/vulkan/vulkan_preprocess.py
+++ b/backends/vulkan/vulkan_preprocess.py
@@ -8,7 +8,7 @@
 
 from functools import partial
 
-from typing import Any, Dict, final, List
+from typing import Any, Callable, Dict, final, List
 
 import executorch.backends.vulkan.utils as utils
 
@@ -56,7 +56,9 @@
 
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
-from executorch.exir.program._program import _copy_module
+from executorch.exir.program._program import _transform
+
+from torch._export.verifier import Verifier
 
 from torch.export._remove_auto_functionalized_pass import (
     unsafe_remove_auto_functionalized_pass,
@@ -65,28 +67,34 @@
 DEFAULT_DEBUG_HANDLE = 65535
 
 
+class _any_op(Verifier):
+    # Set training dialect to skip functional check in base verifier
+    dialect = "TRAINING"
+
+    def allowed_op_types(self):
+        return (Callable,)
+
+
 # pyre-ignore
 def apply_passes(program: ExportedProgram, passes) -> ExportedProgram:
     for p in passes:
-        if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
-            new_gm = program.graph_module
-            # This is a workaround to allow the memory planning pass to work without
-            # having to first apply ToOutVarPass(). See the `greedy()` function in
-            # `exir.memory_planning`; if this attribute isn't set, assertions in
-            # `collect_spec_from_nodes()` will fail.
-            if isinstance(p, MemoryPlanningPass):
-                new_gm.encounter_to_out_var_failure = True
-
-            new_gm_res = p(new_gm)
-            assert new_gm_res is not None
-            new_gm = new_gm_res.graph_module
-
+        if isinstance(p, MemoryPlanningPass) and hasattr(p, "run"):
+            p.run(program.graph_module)
+
+        elif issubclass(type(p), ExportPass) or issubclass(type(p), PassBase):
+            # Some passes require the ep to be provided. However, since the ep may be
+            # updated with each pass applied, the ep must be set right before calling
+            # the pass. _exported_program is the attribute used by XNNPACK and Vulkan
+            # passes to store the exported program.
+            if hasattr(p, "_exported_program"):
+                p._exported_program = program
+
+            program = _transform(program, p, override_verifiers=[_any_op])
             # See the application of this function in exir/program/_program.py for more
             # details on why this step is necessary.
             if isinstance(p, SpecPropPass):
-                p.update_placeholder_tensor_specs(program, new_gm)
+                p.update_placeholder_tensor_specs(program, program.graph_module)
 
-            _copy_module(program.graph_module, new_gm)
         else:
             program = p(program)
 
@@ -159,17 +167,17 @@ def preprocess(  # noqa: C901
         program = apply_passes(
             program,
             [
-                FusePatternsPass(program),
-                RemoveRedundantOpsTransform(),
+                FuseBatchNormPass(program),
+                FusePatternsPass(),
+                FuseClampPass(),
                 AddmmToLinearTransform(),
-                FuseQuantizedOpsTransform(program),
+                RemoveRedundantOpsTransform(),
+                FuseQuantizedOpsTransform(),
                 ReplaceQDQPass(),
-                FoldQDQPass(program),
+                FoldQDQPass(),
                 SqueezeUnsqueezeInputs(),
                 FuseViewCopyTransform(),
                 ViewCopyToSqueezeUnsqueezePass(),
-                FuseBatchNormPass(program),
-                FuseClampPass(),
             ],
         )
 
@@ -215,6 +223,11 @@ def preprocess(  # noqa: C901
         mem_planning_suite = MemoryPlanningAlgorithmSuite(
             algo_list=[greedy_memory_planning]
         )
+        # This is a workaround to allow the memory planning pass to work without having
+        # to first apply ToOutVarPass(). See the `greedy()` function in
+        # `exir.memory_planning`; if this attribute isn't set, assertions in
+        # `collect_spec_from_nodes()` will fail.
+        program.graph_module.encounter_to_out_var_failure = True
         program = apply_passes(
             program,
             [
diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
index 85e9889ca36..c1bc3a54f7c 100644
--- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
+++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py
@@ -110,7 +110,9 @@ def is_nhwc_node(node: torch.fx.Node) -> bool:
             if len(quantize_node.all_input_nodes) > 0:
                 actual_node = quantize_node.args[0]
                 if actual_node.op == "placeholder":
-                    return not actual_node.meta["val"][0].is_contiguous()
+                    return ChannelsLastTaggedReshapePass._is_nhwc_tensor(
+                        actual_node.meta["val"][0]
+                    )
                 else:
                     return actual_node.meta.get(
                         ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False
@@ -125,7 +127,9 @@ def is_nchw_node(node: torch.fx.Node) -> bool:
             if len(quantize_node.all_input_nodes) > 0:
                 actual_node = quantize_node.args[0]
                 if actual_node.op == "placeholder":
-                    return actual_node.meta["val"][0].is_contiguous()
+                    return not ChannelsLastTaggedReshapePass._is_nhwc_tensor(
+                        actual_node.meta["val"][0]
+                    )
                 else:
                     return not actual_node.meta.get(
                         ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False
@@ -133,6 +137,26 @@ def is_nchw_node(node: torch.fx.Node) -> bool:
 
         return not ChannelsLastTaggedReshapePass.is_nhwc_node(node)
 
+    @staticmethod
+    def _is_nhwc_tensor(tensor: torch.Tensor) -> bool:
+        nhwc = tensor.is_contiguous(memory_format=torch.channels_last)
+        nchw = tensor.is_contiguous()
+        # if both are true false
+        # if  both nchw and nhwc are true
+        #     then we want to see this is nchw hence return false
+        # if either of nchw or nhwc is false, then just rely on hwc
+        # if both are false, mayb channels_last_3d, then return nhwc
+        #    however this should not happen here
+        # return (not (nchw and nhwc)) and nhwc
+        # Readable version
+        if nchw and nhwc:
+            return False
+        else:
+            return nhwc
+
+    def _is_nhwc(self, tensor: torch.Tensor) -> bool:
+        return ChannelsLastTaggedReshapePass._is_nhwc_tensor(tensor)
+
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
         return node.target in self.memory_sensitive_ops_nhwc
 
@@ -315,11 +339,8 @@ def input_dim_order(
         self, input_node: torch.fx.Node, input_order: InputDimOrder
     ) -> bool:
         if input_node.op == "placeholder":
-            return (
-                input_node.meta["val"].is_contiguous()
-                if input_order == InputDimOrder.NCHW
-                else not input_node.meta["val"].is_contiguous()
-            )
+            is_nhwc = self._is_nhwc(input_node.meta["val"])
+            return not is_nhwc if input_order == InputDimOrder.NCHW else is_nhwc
         else:
             return (
                 ChannelsLastTaggedReshapePass.is_nchw_node(input_node)
@@ -348,7 +369,7 @@ def input_to_nhwc(
             self.mark_as_nhwc_node(input_node)
 
         if input_node.op == "placeholder":
-            if not input_node.meta["val"][0].is_contiguous():
+            if self._is_nhwc(input_node.meta["val"][0]):
                 return
         elif ChannelsLastTaggedReshapePass.is_nhwc_node(input_node):
             return
@@ -420,7 +441,7 @@ def input_to_nchw(
             self.mark_as_nchw_node(input_node)
 
         if input_node.op == "placeholder":
-            if input_node.meta["val"].is_contiguous():
+            if not self._is_nhwc(input_node.meta["val"]):
                 return
         elif ChannelsLastTaggedReshapePass.is_nchw_node(input_node):
             return
@@ -462,17 +483,17 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                     and isinstance(node.meta["val"], torch.Tensor)
                     and len(node.meta["val"].shape) == 4
                 ):
-                    if node.meta["val"].is_contiguous():
-                        self.mark_as_nchw_node(node)
-                    else:
+                    if self._is_nhwc(node.meta["val"]):
                         self.mark_as_nhwc_node(node)
+                    else:
+                        self.mark_as_nchw_node(node)
                 continue
 
             # Need special case for output node because it can have multiple output dim orders as we can output a tuple multiple nodes
             if node.op == "output":
                 out_tuple = node.args[0]
                 for out_node in out_tuple:
-                    if out_node.meta["val"].is_contiguous():
+                    if not self._is_nhwc(out_node.meta["val"]):
                         self.input_to_nchw(graph_module, out_node, node)
                     else:
                         self.input_to_nhwc(graph_module, out_node, node)
diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py
index d17b7abd6a1..93424b1c84d 100644
--- a/backends/xnnpack/operators/__init__.py
+++ b/backends/xnnpack/operators/__init__.py
@@ -41,6 +41,7 @@
     op_relu,
     op_rsqrt,
     op_sigmoid,
+    op_sin,
     op_skip_ops,
     op_slice_copy,
     op_softmax,
diff --git a/backends/xnnpack/operators/op_sin.py b/backends/xnnpack/operators/op_sin.py
new file mode 100644
index 00000000000..56fe9396103
--- /dev/null
+++ b/backends/xnnpack/operators/op_sin.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.xnnpack.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
+    XNNGraph,
+    XNNSin,
+    XNode,
+)
+from executorch.backends.xnnpack.utils.utils import get_input_node
+
+
+@register_node_visitor
+class SinVisitor(NodeVisitor):
+    target = "aten.sin.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        xnn_graph: XNNGraph,
+        vals_to_ids: Dict[torch.fx.Node, int],
+        debug_handle: int,
+    ) -> None:
+        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+
+        # input
+        input_id = vals_to_ids[get_input_node(node, 0)]
+
+        # output
+        output_id = vals_to_ids[node]
+
+        ser_node = XNode(
+            xnode_union=XNNSin(
+                input_id=input_id,
+                output_id=output_id,
+                flags=0,
+            ),
+            debug_handle=debug_handle,
+        )
+        xnn_graph.xnodes.append(ser_node)
diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py
index e393f1c9ac8..86baba3e3f7 100644
--- a/backends/xnnpack/partition/config/__init__.py
+++ b/backends/xnnpack/partition/config/__init__.py
@@ -45,6 +45,7 @@
     ReciprocalSquareRootConfig,
     ReLUConfig,
     SigmoidConfig,
+    SinConfig,
     SliceCopyConfig,
     SoftmaxConfig,
     SquareRootConfig,
@@ -105,6 +106,7 @@
     TanhConfig,
     ToDimOrderCopyConfig,
     SigmoidConfig,
+    SinConfig,
     SliceCopyConfig,
     SoftmaxConfig,
     SquareRootConfig,
diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py
index 559d1522275..06024c632c9 100644
--- a/backends/xnnpack/partition/config/generic_node_configs.py
+++ b/backends/xnnpack/partition/config/generic_node_configs.py
@@ -636,3 +636,10 @@ class BMMConfig(GenericNodePartitionerConfig):
 
     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
+
+
+class SinConfig(GenericNodePartitionerConfig):
+    target_name = "sin.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index eb9b668dafa..b71ab08ea45 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -1690,6 +1690,7 @@ _DEFINE_UNARY_NODE_NO_PARAMS(Log, xnn_unary_log)
 _DEFINE_UNARY_NODE_NO_PARAMS(Negate, xnn_unary_negate)
 _DEFINE_UNARY_NODE_NO_PARAMS(Square, xnn_unary_square)
 _DEFINE_UNARY_NODE_NO_PARAMS(Abs, xnn_unary_abs)
+_DEFINE_UNARY_NODE_NO_PARAMS(Sin, xnn_unary_sine)
 
 // Unary Ops with min/max params
 _DEFINE_UNARY_NODE_WITH_MINMAX(Clamp, xnn_unary_clamp)
@@ -1737,6 +1738,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(Floor)
     _DEFINE(PReLU)
     _DEFINE(Sigmoid)
+    _DEFINE(Sin)
 
     // Others
     _DEFINE(FullyConnected)
diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs
index 950318f18dc..239f92d899e 100644
--- a/backends/xnnpack/serialization/runtime_schema.fbs
+++ b/backends/xnnpack/serialization/runtime_schema.fbs
@@ -156,6 +156,7 @@ union XNodeUnion {
   XNNGelu: _XNNNode1x1,
   XNNTanh: _XNNNode1x1,
   XNNExp: _XNNNode1x1,
+  XNNSin: _XNNNode1x1,
 }
 
 union XValueUnion {
diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs
index a4efc627cbb..92a61c5537b 100644
--- a/backends/xnnpack/serialization/schema.fbs
+++ b/backends/xnnpack/serialization/schema.fbs
@@ -152,6 +152,7 @@ union XNodeUnion {
   XNNGelu: _XNNNode1x1,
   XNNTanh: _XNNNode1x1,
   XNNExp: _XNNNode1x1,
+  XNNSin: _XNNNode1x1,
 }
 
 union XValueUnion {
diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py
index 99b64708f86..2b3f8e74202 100644
--- a/backends/xnnpack/serialization/xnnpack_graph_schema.py
+++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py
@@ -347,6 +347,11 @@ class XNNPReLU(XNNNode2x1):
     pass
 
 
+@dataclass
+class XNNSin(XNNNode1x1):
+    pass
+
+
 @dataclass
 class XNNScaledDotProductAttention:
     query_id: int
@@ -402,6 +407,8 @@ class XNNScaledDotProductAttention:
     XNNLog,
     XNNGelu,
     XNNTanh,
+    XNNExp,
+    XNNSin,
 ]
 
 
diff --git a/backends/xnnpack/test/ops/test_sin.py b/backends/xnnpack/test/ops/test_sin.py
new file mode 100644
index 00000000000..6a1b323e14c
--- /dev/null
+++ b/backends/xnnpack/test/ops/test_sin.py
@@ -0,0 +1,87 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.test.tester import Tester
+
+
+class TestSin(unittest.TestCase):
+    def setUp(self):
+        torch._dynamo.reset()
+
+    class Sin(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x):
+            z = torch.sin(x)
+            return z
+
+    def _test_sin(self, inputs, legacy_mode: bool = False):
+        tester = (
+            Tester(self.Sin(), inputs)
+            .export()
+            .check_count({"torch.ops.aten.sin.default": 1})
+        )
+
+        if legacy_mode:
+            tester = tester.to_edge().partition()
+        else:
+            tester = tester.to_edge_transform_and_lower()
+
+        (
+            tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .check_not(["executorch_exir_dialects_edge__ops_aten_sin_default"])
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
+
+    def test_fp16_sin(self):
+        inputs = (
+            torch.Tensor(
+                [
+                    [0.0, 0.1, 0.5, 0.785398],
+                    [-0.5, -0.785398, 1.5708, -1.5708],
+                ],
+            ).to(torch.float16),
+        )
+        self._test_sin(inputs, legacy_mode=False)
+
+    def test_fp16_sin_legacy_mode(self):
+        inputs = (
+            torch.Tensor(
+                [
+                    [0.0, 0.1, 0.5, 0.785398],
+                    [-0.5, -0.785398, 1.5708, -1.5708],
+                ],
+            ).to(torch.float16),
+        )
+        self._test_sin(inputs, legacy_mode=True)
+
+    def test_fp32_sin(self):
+        inputs = (
+            torch.Tensor(
+                [
+                    [0.0, 0.1, 0.5, 0.785398],
+                    [-0.5, -0.785398, 1.5708, -1.5708],
+                ],
+            ),
+        )
+        self._test_sin(inputs, legacy_mode=False)
+
+    def test_fp32_sin_legacy_mode(self):
+        inputs = (
+            torch.Tensor(
+                [
+                    [0.0, 0.1, 0.5, 0.785398],
+                    [-0.5, -0.785398, 1.5708, -1.5708],
+                ],
+            ),
+        )
+        self._test_sin(inputs, legacy_mode=True)
diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py
index 05fb53a837d..cdceb8a90a1 100644
--- a/backends/xnnpack/xnnpack_preprocess.py
+++ b/backends/xnnpack/xnnpack_preprocess.py
@@ -71,6 +71,11 @@ def generate_node_to_external_map(
         if node.op == "output":
             for output_nodes in node.args:
                 for output_node in output_nodes:
+                    if output_node in node_to_external_map:
+                        raise RuntimeError(
+                            f"Output node '{output_node}' is already in the inputs. "
+                            "This is likely due to pass through arguments, which are not supported in XNNPACK Delegate."
+                        )
                     node_to_external_map[output_node] = ExternalMeta(
                         external_id=len(node_to_external_map),
                         io_type=XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt
index fa5412ac476..fb154ff88bc 100644
--- a/configurations/CMakeLists.txt
+++ b/configurations/CMakeLists.txt
@@ -63,6 +63,6 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   install(
     TARGETS optimized_native_cpu_ops_lib
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
   )
 endif()
diff --git a/docs/README.md b/docs/README.md
index e30decb9362..845267b32f6 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -43,7 +43,7 @@ To build the documentation locally:
    git clone -b viable/strict https://github.com/pytorch/executorch.git && cd executorch
    ```
 
-1. If you don't have it already, start either a Python virtual envitonment:
+1. If you don't have it already, start either a Python virtual environment:
 
    ```bash
    python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip
@@ -111,7 +111,7 @@ You can use the variables in both regular text and code blocks.
 ## Including READMEs to the Documentation Build
 
 You might want to include some of the `README.md` files from various directories
-in this repositories in your documentation build. To do that, create an `.md`
+in this repository in your documentation build. To do that, create an `.md`
 file and use the `{include}` directive to insert your `.md` files. Example:
 
 ````
@@ -177,7 +177,7 @@ file:
 ````
 
 In the `index.md` file, I would add `tutorials/selective-build-tutorial` in
-both the `toctree` and the `cusotmcarditem` sections.
+both the `toctree` and the `customcarditem` sections.
 
 # Auto-generated API documentation
 
diff --git a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg b/docs/source/_static/img/ExecuTorch-Logo-cropped.svg
deleted file mode 100644
index 9e0ef52fbd8..00000000000
--- a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg
-   id="Layer_2"
-   viewBox="0 0 51.200001 38.52"
-   width="51.200001"
-   height="38.52"
-   version="1.1"
-   sodipodi:docname="ExecuTorch-Logo-cropped.svg"
-   inkscape:version="1.2.1 (9c6d41e4, 2022-07-14)"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:svg="http://www.w3.org/2000/svg">
-  <sodipodi:namedview
-     id="namedview15"
-     pagecolor="#ffffff"
-     bordercolor="#000000"
-     borderopacity="0.25"
-     inkscape:showpageshadow="2"
-     inkscape:pageopacity="0.0"
-     inkscape:pagecheckerboard="0"
-     inkscape:deskcolor="#d1d1d1"
-     showgrid="false"
-     inkscape:zoom="8.0613964"
-     inkscape:cx="18.235054"
-     inkscape:cy="7.6289512"
-     inkscape:window-width="1680"
-     inkscape:window-height="819"
-     inkscape:window-x="0"
-     inkscape:window-y="25"
-     inkscape:window-maximized="0"
-     inkscape:current-layer="Layer_2" />
-  <defs
-     id="defs4">
-    <style
-       id="style2">.cls-1{fill:#cc2faa;}</style>
-  </defs>
-  <path
-     class="cls-1"
-     d="m 26.89,12.15 c 1.27,-1.27 3.33,-1.27 4.59,0 1.26,1.27 1.26,3.32 0,4.59 -1.26,1.27 -3.33,1.27 -4.59,0 -1.26,-1.27 -1.26,-3.32 0,-4.59"
-     id="path6" />
-  <polygon
-     class="cls-1"
-     points="16.1,27.25 16.11,21.52 39.95,45.19 51.49,45.17 51.53,22.49 55.6,18.42 55.55,49.23 38.27,49.26 "
-     id="polygon8"
-     transform="translate(-4.4,-10.74)" />
-  <polygon
-     class="cls-1"
-     points="4.4,41.62 4.45,10.77 21.74,10.74 30.38,19.31 27.5,22.19 20.05,14.81 8.52,14.83 8.48,37.55 "
-     id="polygon10"
-     transform="translate(-4.4,-10.74)" />
-  <polygon
-     class="cls-1"
-     points="39.52,28.41 44.48,33.33 44.47,39.06 36.66,31.31 "
-     id="polygon12"
-     transform="translate(-4.4,-10.74)" />
-</svg>
diff --git a/docs/source/_static/img/executorch-chip-logo-circle-16.png b/docs/source/_static/img/executorch-chip-logo-circle-16.png
new file mode 100644
index 00000000000..a3966ae27db
Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-16.png differ
diff --git a/docs/source/_static/img/executorch-chip-logo-circle-32.png b/docs/source/_static/img/executorch-chip-logo-circle-32.png
new file mode 100644
index 00000000000..83f1018a76c
Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-32.png differ
diff --git a/docs/source/_static/img/executorch-chip-logo.svg b/docs/source/_static/img/executorch-chip-logo.svg
new file mode 100644
index 00000000000..11e5ed60956
--- /dev/null
+++ b/docs/source/_static/img/executorch-chip-logo.svg
@@ -0,0 +1,205 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   width="45.129288mm"
+   height="45.129242mm"
+   viewBox="0 0 45.129288 45.129242"
+   version="1.1"
+   id="svg1124"
+   inkscape:version="1.2.1 (9c6d41e, 2022-07-14)"
+   sodipodi:docname="executorch-chip-logo.svg"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <sodipodi:namedview
+     id="namedview1126"
+     pagecolor="#ffffff"
+     bordercolor="#000000"
+     borderopacity="0.25"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:document-units="mm"
+     showgrid="false"
+     inkscape:zoom="2.3786088"
+     inkscape:cx="20.600277"
+     inkscape:cy="32.161657"
+     inkscape:current-layer="layer1" />
+  <defs
+     id="defs1121">
+    <linearGradient
+       id="linearGradient2449"
+       inkscape:swatch="solid">
+      <stop
+         style="stop-color:#ffffff;stop-opacity:1;"
+         offset="0"
+         id="stop2447" />
+    </linearGradient>
+  </defs>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-15.818847,-61.123938)">
+    <g
+       id="g2797"
+       transform="matrix(0.90140816,0,0,0.90140816,3.7842987,8.2510089)">
+      <rect
+         style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+         id="rect2207"
+         width="28.699108"
+         height="28.699108"
+         x="-46.385078"
+         y="71.985069"
+         ry="2.4088593"
+         rx="2.4088593"
+         transform="rotate(-45)" />
+      <g
+         id="g2221"
+         style="fill:#000000;fill-opacity:1;stroke-width:1.20129"
+         transform="matrix(0.15573902,-0.15573902,0.15573902,0.15573902,-118.24576,-200.72519)">
+        <path
+           id="path2209"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1381.7832 v 11.8926 h 10.86719 c 0.95107,0 1.7168,-0.7657 1.7168,-1.7168 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7656 1.7168,-1.7168 v -3.2754 c 0,-0.9512 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.951 -0.76573,-1.7168 -1.7168,-1.7168 z" />
+        <path
+           id="path2211"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1400.6595 v 11.8926 h 10.86719 c 0.95107,0 1.7168,-0.7657 1.7168,-1.7168 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7656 1.7168,-1.7168 v -3.2754 c 0,-0.9512 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.951 -0.76573,-1.7168 -1.7168,-1.7168 z" />
+        <path
+           id="path2213"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1419.5359 v 11.8925 h 10.86719 c 0.95107,0 1.7168,-0.7657 1.7168,-1.7168 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7655 1.7168,-1.7168 v -3.2754 c 0,-0.9512 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.951 -0.76573,-1.7168 -1.7168,-1.7167 z" />
+        <path
+           id="path2215"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1438.4121 v 11.8925 h 10.86719 c 0.95107,0 1.7168,-0.7657 1.7168,-1.7167 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7656 1.7168,-1.7168 v -3.2754 c 0,-0.9513 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.9511 -0.76573,-1.7168 -1.7168,-1.7168 z" />
+        <path
+           id="path2217"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1457.2883 v 11.8926 h 10.86719 c 0.95107,0 1.7168,-0.7658 1.7168,-1.7168 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7656 1.7168,-1.7168 v -3.2754 c 0,-0.9512 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.9511 -0.76573,-1.7168 -1.7168,-1.7168 z" />
+        <path
+           id="path2219"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -341.83555,1362.907 v 11.8925 h 10.86719 c 0.95107,0 1.7168,-0.7657 1.7168,-1.7167 v -0.875 h 12.55273 c 0.95122,0 1.7168,-0.7656 1.7168,-1.7168 v -3.2754 c 0,-0.9513 -0.76558,-1.7168 -1.7168,-1.7168 h -12.55273 v -0.875 c 0,-0.9511 -0.76573,-1.7168 -1.7168,-1.7168 z" />
+      </g>
+      <g
+         id="g2235"
+         style="fill:#000000;fill-opacity:1;stroke-width:1.20129"
+         transform="matrix(0.15573902,-0.15573902,0.15573902,0.15573902,-118.24576,-200.72519)">
+        <path
+           id="path2223"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -444.51285,1347.476 h 11.8926 v -10.8672 c 0,-0.9511 -0.7657,-1.7168 -1.7168,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7656,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9512,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.951,0 -1.7168,0.7657 -1.7168,1.7168 z" />
+        <path
+           id="path2225"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -425.63655,1347.476 h 11.8926 v -10.8672 c 0,-0.9511 -0.7657,-1.7168 -1.7168,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7656,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9512,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.951,0 -1.7168,0.7657 -1.7168,1.7168 z" />
+        <path
+           id="path2227"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -406.76015,1347.476 h 11.8925 v -10.8672 c 0,-0.9511 -0.7657,-1.7168 -1.7168,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7655,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9512,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.951,0 -1.7168,0.7657 -1.7167,1.7168 z" />
+        <path
+           id="path2229"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -387.88395,1347.476 h 11.8925 v -10.8672 c 0,-0.9511 -0.7657,-1.7168 -1.7167,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7656,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9513,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.9511,0 -1.7168,0.7657 -1.7168,1.7168 z" />
+        <path
+           id="path2231"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -369.00775,1347.476 h 11.8926 v -10.8672 c 0,-0.9511 -0.7658,-1.7168 -1.7168,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7656,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9512,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.9511,0 -1.7168,0.7657 -1.7168,1.7168 z" />
+        <path
+           id="path2233"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -463.38905,1347.476 h 11.8925 v -10.8672 c 0,-0.9511 -0.7657,-1.7168 -1.7167,-1.7168 h -0.875 v -12.5527 c 0,-0.9512 -0.7656,-1.7168 -1.7168,-1.7168 h -3.2754 c -0.9513,0 -1.7168,0.7656 -1.7168,1.7168 v 12.5527 h -0.875 c -0.9511,0 -1.7168,0.7657 -1.7168,1.7168 z" />
+      </g>
+      <g
+         id="g2249"
+         style="fill:#000000;fill-opacity:1;stroke-width:1.20129"
+         transform="matrix(0.15573902,-0.15573902,0.15573902,0.15573902,-118.24576,-200.72519)">
+        <path
+           id="path2237"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1450.1533 v -11.8926 h -10.86719 c -0.95107,0 -1.7168,0.7657 -1.7168,1.7168 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7656 -1.7168,1.7168 v 3.2754 c 0,0.9512 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.951 0.76573,1.7168 1.7168,1.7168 z" />
+        <path
+           id="path2239"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1431.277 v -11.8926 h -10.86719 c -0.95107,0 -1.7168,0.7657 -1.7168,1.7168 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7656 -1.7168,1.7168 v 3.2754 c 0,0.9512 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.951 0.76573,1.7168 1.7168,1.7168 z" />
+        <path
+           id="path2241"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1412.4006 v -11.8925 h -10.86719 c -0.95107,0 -1.7168,0.7657 -1.7168,1.7168 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7655 -1.7168,1.7168 v 3.2754 c 0,0.9512 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.951 0.76573,1.7168 1.7168,1.7167 z" />
+        <path
+           id="path2243"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1393.5244 v -11.8925 h -10.86719 c -0.95107,0 -1.7168,0.7657 -1.7168,1.7167 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7656 -1.7168,1.7168 v 3.2754 c 0,0.9513 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.9511 0.76573,1.7168 1.7168,1.7168 z" />
+        <path
+           id="path2245"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1374.6482 v -11.8926 h -10.86719 c -0.95107,0 -1.7168,0.7658 -1.7168,1.7168 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7656 -1.7168,1.7168 v 3.2754 c 0,0.9512 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.9511 0.76573,1.7168 1.7168,1.7168 z" />
+        <path
+           id="path2247"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -478.66849,1469.0295 v -11.8925 h -10.86719 c -0.95107,0 -1.7168,0.7657 -1.7168,1.7167 v 0.875 h -12.55273 c -0.95122,0 -1.7168,0.7656 -1.7168,1.7168 v 3.2754 c 0,0.9513 0.76558,1.7168 1.7168,1.7168 h 12.55273 v 0.875 c 0,0.9511 0.76573,1.7168 1.7168,1.7168 z" />
+      </g>
+      <g
+         id="g2263"
+         style="fill:#000000;fill-opacity:1;stroke-width:1.20129"
+         transform="matrix(0.15573902,-0.15573902,0.15573902,0.15573902,-118.24576,-200.72519)">
+        <path
+           id="path2251"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -375.99123,1484.4604 h -11.8926 v 10.8672 c 0,0.951 0.7657,1.7168 1.7168,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7656,1.7168 1.7168,1.7168 h 3.2754 c 0.9512,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.951,0 1.7168,-0.7658 1.7168,-1.7168 z" />
+        <path
+           id="path2253"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -394.86753,1484.4604 h -11.8926 v 10.8672 c 0,0.951 0.7657,1.7168 1.7168,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7656,1.7168 1.7168,1.7168 h 3.2754 c 0.9512,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.951,0 1.7168,-0.7658 1.7168,-1.7168 z" />
+        <path
+           id="path2255"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -413.74393,1484.4604 h -11.8925 v 10.8672 c 0,0.951 0.7657,1.7168 1.7168,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7655,1.7168 1.7168,1.7168 h 3.2754 c 0.9512,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.951,0 1.7168,-0.7658 1.7167,-1.7168 z" />
+        <path
+           id="path2257"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -432.62013,1484.4604 h -11.8925 v 10.8672 c 0,0.951 0.7657,1.7168 1.7167,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7656,1.7168 1.7168,1.7168 h 3.2754 c 0.9513,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.9511,0 1.7168,-0.7658 1.7168,-1.7168 z" />
+        <path
+           id="path2259"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -451.49633,1484.4604 h -11.8926 v 10.8672 c 0,0.951 0.7658,1.7168 1.7168,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7656,1.7168 1.7168,1.7168 h 3.2754 c 0.9512,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.9511,0 1.7168,-0.7658 1.7168,-1.7168 z" />
+        <path
+           id="path2261"
+           style="font-variation-settings:normal;fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.4026;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+           d="m -357.11503,1484.4604 h -11.8925 v 10.8672 c 0,0.951 0.7657,1.7168 1.7167,1.7168 h 0.875 v 12.5527 c 0,0.9512 0.7656,1.7168 1.7168,1.7168 h 3.2754 c 0.9513,0 1.7168,-0.7656 1.7168,-1.7168 v -12.5527 h 0.875 c 0.9511,0 1.7168,-0.7658 1.7168,-1.7168 z" />
+      </g>
+      <rect
+         style="font-variation-settings:normal;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#ffffff;stroke-width:0.814388;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000"
+         id="rect2265"
+         width="23.585829"
+         height="23.585829"
+         x="-43.828445"
+         y="74.54174"
+         ry="0.11391187"
+         rx="0.11391187"
+         transform="rotate(-45)" />
+      <g
+         id="g2205"
+         transform="matrix(0.21958723,0,0,0.21958723,28.325015,-164.37637)"
+         style="display:inline;fill:#fb1620;fill-opacity:1;stroke:none;stroke-width:1.2049">
+        <path
+           fill="#ee4c2c"
+           d="m 77.6,1099.6 -8.1,8.1 c 13.3,13.3 13.3,34.7 0,47.8 -13.3,13.3 -34.7,13.3 -47.8,0 -13.3,-13.3 -13.3,-34.7 0,-47.8 v 0 l 21.1,-21.1 3,-3 v 0 -15.9 L 14,1099.5 c -17.7,17.7 -17.7,46.3 0,64 17.7,17.7 46.3,17.7 63.7,0 17.6,-17.7 17.6,-46.1 -0.1,-63.9 z"
+           id="path2201"
+           style="fill:#fb1620;fill-opacity:1;stroke:none;stroke-width:1.2049" />
+        <circle
+           fill="#ee4c2c"
+           cx="61.700001"
+           cy="1091.8"
+           r="5.9000001"
+           id="circle2203"
+           style="fill:#fb1620;fill-opacity:1;stroke:none;stroke-width:1.2049" />
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/docs/source/advanced-topics-section.md b/docs/source/advanced-topics-section.md
new file mode 100644
index 00000000000..e7b7f5490c6
--- /dev/null
+++ b/docs/source/advanced-topics-section.md
@@ -0,0 +1,112 @@
+(advanced-topics-section)=
+
+# Advanced
+
+Deep dive into ExecuTorch's advanced features for optimization, customization, and integration.
+
+This section covers advanced concepts for developers who need to customize ExecuTorch for specific use cases, optimize performance, or integrate with custom hardware backends.
+
+## Quantization & Optimization
+
+Techniques for model compression and performance optimization.
+
+**→ {doc}`quantization-optimization` — Quantization strategies and performance optimization**
+
+Key topics:
+
+- Quantization strategies and techniques
+- Performance profiling and optimization
+
+## Model Export
+
+Learn the core ExecuTorch workflow, exporting PyTorch models to the `.pte` format for edge deployment.
+
+**→ {doc}`using-executorch-export`** - Model Export & Lowering
+
+Key topics:
+
+- Export and Lowering Workflow
+- Hardware Backend Selection & Optimization
+- Dynamic Shapes & Advanced Model Features
+
+
+## Kernel Library
+
+Deep dive into ExecuTorch's kernel implementation and customization.
+
+**→ {doc}`kernel-library-advanced` — Kernel library deep dive and customization**
+
+Key topics:
+
+- Kernel library architecture
+- Custom kernel implementation
+- Selective build and optimization
+
+## Backend & Delegates
+
+**→ {doc}`backend-delegate-advanced` — Backend delegate integration**
+
+Key topics:
+
+- Learn how to integrate Backend Delegate into ExecuTorch and more
+- XNNPACK Delegate Internals
+- Debugging Delegation
+
+
+## Runtime & Integration
+
+Advanced runtime features and backend integration.
+
+**→ {doc}`runtime-integration-advanced` — Runtime customization and backend integration**
+
+Key topics:
+
+- Backend delegate implementation
+- Platform abstraction layer
+- Custom runtime integration
+
+## Compiler & IR
+
+Advanced compiler features and intermediate representation details.
+
+**→ {doc}`compiler-ir-advanced` — Compiler passes and IR specification**
+
+Key topics:
+
+- Custom compiler passes
+- Memory planning strategies
+- Backend dialect and EXIR
+- Ops set definition
+
+
+## File Formats
+
+ExecuTorch file format specifications and internals.
+
+**→ {doc}`file-formats-advanced` — PTE and PTD file format specifications**
+
+Key topics:
+
+- PTE file format internals
+- PTD file format specification
+- Custom file format handling
+
+## Next Steps
+
+After exploring advanced topics:
+
+- **{doc}`tools-sdk-section`** - Developer tools for debugging and profiling
+- **{doc}`api-section`** - Complete API reference documentation
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Advanced Topics
+
+quantization-optimization
+using-executorch-export
+kernel-library-advanced
+backend-delegate-advanced
+runtime-integration-advanced
+compiler-ir-advanced
+file-formats-advanced
diff --git a/docs/source/android-arm-vgf.md b/docs/source/android-arm-vgf.md
new file mode 100644
index 00000000000..cc39b53e176
--- /dev/null
+++ b/docs/source/android-arm-vgf.md
@@ -0,0 +1 @@
+```{include} backends-arm-vgf.md
diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md
new file mode 100644
index 00000000000..d506813990b
--- /dev/null
+++ b/docs/source/android-backends.md
@@ -0,0 +1,28 @@
+(android-backends)=
+# Backends
+
+Available hardware acceleration backends for Android deployment.
+
+## CPU Acceleration
+
+- {doc}`android-xnnpack` — XNNPACK CPU acceleration
+
+## GPU Acceleration
+
+- {doc}`android-vulkan` — Vulkan GPU acceleration
+
+## NPU/Accelerator Backends
+
+- {doc}`android-qualcomm` — Qualcomm AI Engine (NPU)
+- {doc}`android-mediatek` — MediaTek NPU acceleration
+- {doc}`android-arm-vgf` — ARM VGF Backend
+- {doc}`android-samsung-exynos` — Samsung Exynos NPU
+
+```{toctree}
+:hidden:
+android-xnnpack
+android-vulkan
+android-qualcomm
+android-mediatek
+android-arm-vgf
+android-samsung-exynos
diff --git a/docs/source/android-examples.md b/docs/source/android-examples.md
new file mode 100644
index 00000000000..65580870c57
--- /dev/null
+++ b/docs/source/android-examples.md
@@ -0,0 +1,9 @@
+# Examples & Demos
+
+- [Working with LLMs - Android Examples](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
+- [Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
+- {doc}`tutorial-arm-vgf` — Export a simple PyTorch model for the ExecuTorch VGF backend
+
+```{toctree}
+:hidden:
+tutorial-arm-vgf
diff --git a/docs/source/android-mediatek.md b/docs/source/android-mediatek.md
new file mode 100644
index 00000000000..7034fe439dd
--- /dev/null
+++ b/docs/source/android-mediatek.md
@@ -0,0 +1 @@
+```{include} backends-mediatek.md
diff --git a/docs/source/android-qualcomm.md b/docs/source/android-qualcomm.md
new file mode 100644
index 00000000000..f484d771a8b
--- /dev/null
+++ b/docs/source/android-qualcomm.md
@@ -0,0 +1 @@
+```{include} backends-qualcomm.md
diff --git a/docs/source/android-samsung-exynos.md b/docs/source/android-samsung-exynos.md
new file mode 100644
index 00000000000..4c5a470edca
--- /dev/null
+++ b/docs/source/android-samsung-exynos.md
@@ -0,0 +1 @@
+```{include} backends-samsung-exynos.md
diff --git a/docs/source/android-section.md b/docs/source/android-section.md
new file mode 100644
index 00000000000..a5774352bc1
--- /dev/null
+++ b/docs/source/android-section.md
@@ -0,0 +1,23 @@
+(android-section)=
+
+# Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-android` — Complete Android integration guide
+
+## Backends
+
+- {doc}`android-backends` — Available Android backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`android-examples` — Explore Android Examples & Demos
+
+```{toctree}
+:hidden:
+using-executorch-android
+android-backends
+android-examples
diff --git a/docs/source/android-vulkan.md b/docs/source/android-vulkan.md
new file mode 100644
index 00000000000..6399ac4ec7c
--- /dev/null
+++ b/docs/source/android-vulkan.md
@@ -0,0 +1 @@
+```{include} backends-vulkan.md
diff --git a/docs/source/android-xnnpack.md b/docs/source/android-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/android-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/api-section.md b/docs/source/api-section.md
new file mode 100644
index 00000000000..ab2573aefa9
--- /dev/null
+++ b/docs/source/api-section.md
@@ -0,0 +1,26 @@
+(api-section)=
+# API
+
+In this section, find complete API documentation for ExecuTorch's export, runtime, and extension interfaces. Includes comprehensive references for Python, C++, and Java APIs across all supported platforms.
+
+- {doc}`export-to-executorch-api-reference` — Export to ExecuTorch API Reference
+- {doc}`executorch-runtime-api-reference` — ExecuTorch Runtime API Reference
+- {doc}`runtime-python-api-reference` — Runtime Python API Reference
+- {doc}`api-life-cycle` — API Life Cycle
+- [Android doc →](https://pytorch.org/executorch/main/javadoc/) — Android API Documentation
+- {doc}`extension-module` — Extension Module
+- {doc}`extension-tensor` — Extension Tensor
+- {doc}`running-a-model-cpp-tutorial` — Detailed C++ Runtime APIs Tutorial
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: API Reference
+
+export-to-executorch-api-reference
+executorch-runtime-api-reference
+runtime-python-api-reference
+api-life-cycle
+extension-module
+extension-tensor
+running-a-model-cpp-tutorial
diff --git a/docs/source/api.md b/docs/source/api.md
deleted file mode 100644
index 4f6160d258a..00000000000
--- a/docs/source/api.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# API
-
-```{toctree}
-:maxdepth: 1
-
-export-to-executorch-api-reference
-executorch-runtime-api-reference
-runtime-python-api-reference
-api-life-cycle
-Javadoc <https://pytorch.org/executorch/main/javadoc/>
-```
diff --git a/docs/source/backend-delegate-advanced.md b/docs/source/backend-delegate-advanced.md
new file mode 100644
index 00000000000..752bd1cdc02
--- /dev/null
+++ b/docs/source/backend-delegate-advanced.md
@@ -0,0 +1,33 @@
+(backend-delegate-advanced)=
+
+# Backend & Delegates
+
+## Integration
+
+- {doc}`backend-delegates-integration` — Learn how to integrate a backend delegate into ExecuTorch
+
+## XNNPACK Reference
+
+- {doc}`backend-delegates-xnnpack-reference` — Deep dive into XNNPACK delegate internals and implementation details
+
+## Dependency Management
+
+- {doc}`backend-delegates-dependencies` — Manage third-party dependencies for backend delegates
+
+## Overview
+
+- {doc}`compiler-delegate-and-partitioner` — Understanding backends, delegates, and the partitioner system
+
+## Debugging
+
+- {doc}`debug-backend-delegate` — Tools and techniques for debugging delegation issues
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+backend-delegates-integration
+backend-delegates-xnnpack-reference
+backend-delegates-dependencies
+compiler-delegate-and-partitioner
+debug-backend-delegate
diff --git a/docs/source/backend-delegates-xnnpack-reference.md b/docs/source/backend-delegates-xnnpack-reference.md
index cfb915aca59..8b4338e703c 100644
--- a/docs/source/backend-delegates-xnnpack-reference.md
+++ b/docs/source/backend-delegates-xnnpack-reference.md
@@ -70,7 +70,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre
 When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors.
 
 #### **Profiling**
-We have enabled basic profiling for the XNNPACK delegate that can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)).
+We have enabled basic profiling for the XNNPACK delegate that can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](tutorials/devtools-integration-tutorial) <!-- @lint-ignore --> on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)).
 
 
 [comment]: <> (TODO: Refactor quantizer to a more official quantization doc)
diff --git a/docs/source/backends-arm-ethos-u.md b/docs/source/backends-arm-ethos-u.md
index 4b4cd625d6e..2dfddacd20f 100644
--- a/docs/source/backends-arm-ethos-u.md
+++ b/docs/source/backends-arm-ethos-u.md
@@ -1,7 +1,7 @@
 # Arm&reg; Ethos&trade;-U NPU Backend
 
 The Arm&reg; Ethos&trade;-U backend targets Edge/IoT-type AI use-cases by enabling optimal execution of quantized models on
-[Arm&reg; Ethos&trade;-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm&reg; Ethos&trade;-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and
+[Arm&reg; Ethos&trade;-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm&reg; Ethos&trade;-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and
 [Arm&reg; Ethos&trade;-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85), leveraging [TOSA](https://www.mlplatform.org/tosa/) and the
 [ethos-u-vela](https://pypi.org/project/ethos-u-vela/) graph compiler. This document is a technical reference for using the Ethos-U backend, for a top level view with code examples
 please refer to the [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html).
@@ -282,4 +282,4 @@ full network is converted to use channels last. A word of caution must be given
 unsupported ops being inserted into the graph, and it is currently not widely tested, so the feature must so far be viewed as experimental.
 
 ## See Also
-- [Arm Ethos-U Backend Tutorial](tutorial-arm.md)
\ No newline at end of file
+- [Arm Ethos-U Backend Tutorial](tutorial-arm-ethos-u.md)
\ No newline at end of file
diff --git a/docs/source/backends-coreml.md b/docs/source/backends-coreml.md
index fe6748617a0..3ab0d3d3435 100644
--- a/docs/source/backends-coreml.md
+++ b/docs/source/backends-coreml.md
@@ -61,7 +61,7 @@ The Core ML partitioner API allows for configuration of the model delegation to
  - `skip_ops_for_coreml_delegation`: Allows you to skip ops for delegation by Core ML.  By default, all ops that Core ML supports will be delegated.  See [here](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/test/test_coreml_partitioner.py#L42) for an example of skipping an op for delegation.
 - `compile_specs`: A list of `CompileSpec`s for the Core ML backend.  These control low-level details of Core ML delegation, such as the compute unit (CPU, GPU, ANE), the iOS deployment target, and the compute precision (FP16, FP32).  These are discussed more below.
 - `take_over_mutable_buffer`: A boolean that indicates whether PyTorch mutable buffers in stateful models should be converted to [Core ML `MLState`](https://developer.apple.com/documentation/coreml/mlstate).  If set to `False`, mutable buffers in the PyTorch graph are converted to graph inputs and outputs to the Core ML lowered module under the hood.  Generally, setting `take_over_mutable_buffer` to true will result in better performance, but using `MLState` requires iOS >= 18.0, macOS >= 15.0, and Xcode >= 16.0.
-- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate.  If set to False, constant data is passed to the Core ML delegate as inputs.  By deafault, take_over_constant_data=True.
+- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate.  If set to False, constant data is passed to the Core ML delegate as inputs.  By default, take_over_constant_data=True.
 - `lower_full_graph`: A boolean that indicates whether the entire graph must be lowered to Core ML.  If set to True and Core ML does not support an op, an error is raised during lowering.  If set to False and Core ML does not support an op, the op is executed on the CPU by ExecuTorch.  Although setting `lower_full_graph`=False can allow a model to lower where it would otherwise fail, it can introduce performance overhead in the model when there are unsupported ops.  You will see warnings about unsupported ops during lowering if there are any.  By default, `lower_full_graph`=False.
 
 
@@ -187,7 +187,7 @@ To quantize a PyTorch model for the Core ML backend, use the `CoreMLQuantizer`.
 Quantization with the Core ML backend requires exporting the model for iOS 17 or later.
 To perform 8-bit quantization with the PT2E flow, follow these steps:
 
-1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use to to create an instance of a `CoreMLQuantizer`.
+1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use it to create an instance of a `CoreMLQuantizer`.
 2) Use `torch.export.export` to export a graph module that will be prepared for quantization.
 3) Call `prepare_pt2e` to prepare the model for quantization.
 4) Run the prepared model with representative samples to calibrate the quantizated tensor activation ranges.
@@ -386,4 +386,4 @@ If you're using Python 3.13, try reducing your python version to Python 3.12.  c
 ### At runtime
 1. [ETCoreMLModelCompiler.mm:55] [Core ML]  Failed to compile model, error = Error Domain=com.apple.mlassetio Code=1 "Failed to parse the model specification. Error: Unable to parse ML Program: at unknown location: Unknown opset 'CoreML7'." UserInfo={NSLocalizedDescription=Failed to par$
 
-This means the model requires the the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14.
+This means the model requires the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14.
diff --git a/docs/source/backends-mediatek.md b/docs/source/backends-mediatek.md
index a562cea13bd..34cd56f971b 100644
--- a/docs/source/backends-mediatek.md
+++ b/docs/source/backends-mediatek.md
@@ -23,7 +23,7 @@ The MediaTek backend enables acceleration of PyTorch models on edge devices with
   ```
 - NeuroPilot SDK Python wheels (download from [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress)):
   ```bash
-  pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+  pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl
   pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
   ```
 
diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md
index f02f495f685..f4f7762c769 100644
--- a/docs/source/backends-nxp.md
+++ b/docs/source/backends-nxp.md
@@ -1,5 +1,79 @@
 # NXP eIQ Neutron Backend
 
-See
-[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md)
-for current status about running ExecuTorch on NXP eIQ Neutron Backend.
+This manual page is dedicated to introduction of using the ExecuTorch with NXP eIQ Neutron Backend.
+NXP offers accelerated machine learning models inference on edge devices.
+To learn more about NXP's machine learning acceleration platform, please refer to [the official NXP website](https://www.nxp.com/applications/technologies/ai-and-machine-learning:MACHINE-LEARNING).
+
+<div class="admonition tip">
+For up-to-date status about running ExecuTorch on Neutron Backend please visit the <a href="https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md">manual page</a>.
+</div>
+
+## Features
+
+ExecuTorch v1.0 supports running machine learning models on selected NXP chips (for now only i.MXRT700).
+Among currently supported machine learning models are:
+- Convolution-based neutral networks
+- Full support for MobileNetV2 and CifarNet
+
+## Prerequisites (Hardware and Software)
+
+In order to successfully build ExecuTorch project and convert models for NXP eIQ Neutron Backend you will need a computer running Linux.
+
+If you want to test the runtime, you'll also need:
+- Hardware with NXP's [i.MXRT700](https://www.nxp.com/products/i.MX-RT700) chip or a testing board like MIMXRT700-AVK
+- [MCUXpresso IDE](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-integrated-development-environment-ide:MCUXpresso-IDE) or [MCUXpresso Visual Studio Code extension](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-for-visual-studio-code:MCUXPRESSO-VSC)
+
+## Using NXP backend
+
+To test converting a neural network model for inference on NXP eIQ Neutron Backend, you can use our example script:
+
+```shell
+# cd to the root of executorch repository
+./examples/nxp/aot_neutron_compile.sh [model (cifar10 or mobilenetv2)]
+```
+
+For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py).
+
+### Partitioner API
+
+The partitioner is defined in `NeutronPartitioner` in `backends/nxp/neutron_partitioner.py`. It has the following
+arguments:
+* `compile_spec` - list of key-value pairs defining compilation. E.g. for specifying platform (i.MXRT700) and Neutron Converter flavor.
+* `custom_delegation_options` - custom options for specifying node delegation.
+
+### Quantization
+
+The quantization for Neutron Backend is defined in `NeutronQuantizer` in `backends/nxp/quantizer/neutron_quantizer.py`.
+The quantization follows PT2E workflow, INT8 quantization is supported. Operators are quantized statically, activations
+follow affine and weights symmetric per-tensor quantization scheme.
+
+#### Supported operators
+
+List of Aten operators supported by Neutron quantizer:
+
+`abs`, `adaptive_avg_pool2d`, `addmm`, `add.Tensor`, `avg_pool2d`, `cat`, `conv1d`, `conv2d`, `dropout`,
+`flatten.using_ints`, `hardtanh`, `hardtanh_`, `linear`, `max_pool2d`, `mean.dim`, `pad`, `permute`, `relu`, `relu_`,
+`reshape`, `view`, `softmax.int`, `sigmoid`, `tanh`, `tanh_`
+
+#### Example
+```python
+import torch
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+# Prepare your model in Aten dialect
+aten_model = get_model_in_aten_dialect()
+# Prepare calibration inputs, each tuple is one example, example tuple has items for each model input
+calibration_inputs: list[tuple[torch.Tensor, ...]] = get_calibration_inputs()
+quantizer = NeutronQuantizer()
+
+m = prepare_pt2e(aten_model, quantizer)
+for data in calibration_inputs:
+    m(*data)
+m = convert_pt2e(m)
+```
+
+## Runtime Integration
+
+To learn how to run the converted model on the NXP hardware, use one of our example projects on using ExecuTorch runtime from MCUXpresso IDE example projects list.
+For more finegrained tutorial, visit [this manual page](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html).
diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md
index c83ace26853..4a3313964a8 100644
--- a/docs/source/backends-overview.md
+++ b/docs/source/backends-overview.md
@@ -1,21 +1,64 @@
-# Backend Overview
+# Backends
 
-ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each.
+## Backend Overview
 
-The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details.
+ExecuTorch backends provide hardware acceleration for specific hardware targets, enabling models to run efficiently on devices ranging from mobile phones to embedded systems and DSPs. During the export and lowering process, ExecuTorch optimizes your model for the chosen backend, resulting in a `.pte` file specialized for that hardware. To support multiple platforms (e.g., Core ML on iOS, Arm CPU on Android), you typically generate a dedicated `.pte` file for each backend.
 
-As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example.
+The choice of backend is informed by the hardware your model will run on. Each backend has its own hardware requirements and level of model/operator support. See the documentation for each backend for details.
 
-### Available Backends
+As part of `.pte` file creation, ExecuTorch identifies model partitions supported by the backend. These are processed ahead of time for efficient execution. Operators not supported by the delegate are executed using the portable CPU fallback (e.g., XNNPACK), allowing for partial acceleration. You can also specify multiple partitioners in order of priority, so unsupported GPU ops can fall back to CPU, for example.
 
-Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information.
+---
 
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
-- [Core ML (iOS)](backends-coreml.md)
-- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
-- [Vulkan (Android GPU)](backends-vulkan.md)
-- [Qualcomm NPU](backends-qualcomm.md)
-- [MediaTek NPU](backends-mediatek.md)
-- [ARM Ethos-U NPU](backends-arm-ethos-u.md)
-- [ARM VGF](backends-arm-vgf.md)
-- [Cadence DSP](backends-cadence.md)
+## Why Backends Matter
+
+Backends are the bridge between your exported model and the hardware it runs on. Choosing the right backend ensures your model takes full advantage of device-specific acceleration, balancing performance, compatibility, and resource usage.
+
+---
+
+## Choosing a Backend
+
+| Backend                                  | Platform(s)         | Hardware Type | Typical Use Case                |
+|------------------------------------------|---------------------|---------------|---------------------------------|
+| [XNNPACK](backends-xnnpack)              | All                 | CPU           | General-purpose, fallback       |
+| [Core ML](backends-coreml)               | iOS, macOS          | NPU/GPU       | Apple devices, high performance |
+| [Metal Performance Shaders](backends-mps)| iOS, macOS          | GPU           | Apple GPU acceleration          |
+| [Vulkan ](backends-vulkan)               | Android             | GPU           | Android GPU acceleration        |
+| [Qualcomm](backends-qualcomm)            | Android             | NPU           | Qualcomm SoCs                   |
+| [MediaTek](backends-mediatek)            | Android             | NPU           | MediaTek SoCs                   |
+| [ARM EthosU](backends-arm-ethos-u)       | Embedded            | NPU           | ARM MCUs                        |
+| [ARM VGF](backends-arm-vgf)              | Android             | NPU           | ARM platforms                   |
+| [OpenVINO](build-run-openvino)           | Embedded            | CPU/GPU/NPU   | Intel  SoCs                     |
+| [NXP](backends-nxp)                      | Embedded            | NPU           | NXP SoCs                        |
+| [Cadence](backends-cadence)              | Embedded            | DSP           | DSP-optimized workloads         |
+| [Samsung Exynos](backends-samsung-exynos)| Android             | NPU           | Samsung SoCs                    |
+
+**Tip:** For best performance, export a `.pte` file for each backend you plan to support.
+
+---
+
+## Best Practices
+
+- **Test on all target devices:** Operator support may vary by backend.
+- **Use fallback wisely:** If a backend doesn't support an operator, ExecuTorch will run it on CPU.
+- **Consult backend docs:** Each backend has unique setup and tuning options.
+
+---
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Backend Overview
+
+backends-xnnpack
+backends-coreml
+backends-mps
+backends-vulkan
+backends-qualcomm
+backends-mediatek
+backends-arm-ethos-u
+backends-arm-vgf
+build-run-openvino
+backends-nxp
+backends-cadence
+backends-samsung-exynos
diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md
index 59634b9b39b..74089885fcf 100644
--- a/docs/source/backends-qualcomm.md
+++ b/docs/source/backends-qualcomm.md
@@ -74,10 +74,9 @@ This example is verified with SM8550 and SM8450.
  - A compiler to compile AOT parts, e.g., the GCC compiler comes with Ubuntu LTS.
  - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 26c.
  - [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk)
-   - Click the "Get Software" button to download a version of QNN SDK.
-   - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6.
-   - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon.
-   - [QNN 2.37.0](https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/2.37.0.250724/v2.37.0.250724.zip)
+   - Click the "Get Software" button to download the latest version of the QNN SDK.
+   - Although newer versions are available, we have verified and recommend using QNN 2.37.0 for stability.
+   - You can download it directly from the following link: [QNN 2.37.0](https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/2.37.0.250724/v2.37.0.250724.zip)
 
 The directory with installed Qualcomm AI Engine Direct SDK looks like:
 ```
@@ -136,86 +135,6 @@ cd $EXECUTORCH_ROOT
 ./backends/qualcomm/scripts/build.sh --release
 ```
 
-### AOT (Ahead-of-time) components:
-
-Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct binary.
-
-```bash
-cd $EXECUTORCH_ROOT
-mkdir build-x86
-cd build-x86
-# Note that the below command might change.
-# Please refer to the above build.sh for latest workable commands.
-cmake .. \
-  -DCMAKE_INSTALL_PREFIX=$PWD \
-  -DEXECUTORCH_BUILD_QNN=ON \
-  -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
-  -DEXECUTORCH_BUILD_DEVTOOLS=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-  -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-  -DPYTHON_EXECUTABLE=python3
-
-# nproc is used to detect the number of available CPU.
-# If it is not applicable, please feel free to use the number you want.
-cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc)
-
-# install Python APIs to correct import path
-# The filename might vary depending on your Python and host version.
-cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python
-cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python
-
-# Workaround for .fbs files in exir/_serialize
-cp $EXECUTORCH_ROOT/schema/program.fbs $EXECUTORCH_ROOT/exir/_serialize/program.fbs
-cp $EXECUTORCH_ROOT/schema/scalar_type.fbs $EXECUTORCH_ROOT/exir/_serialize/scalar_type.fbs
-```
-
-### Runtime:
-
-An example `qnn_executor_runner` executable would be used to run the compiled `pte` model.
-
-Commands to build `qnn_executor_runner` for Android:
-
-```bash
-cd $EXECUTORCH_ROOT
-mkdir build-android
-cd build-android
-# build executorch & qnn_executorch_backend
-cmake .. \
-    -DCMAKE_INSTALL_PREFIX=$PWD \
-    -DEXECUTORCH_BUILD_QNN=ON \
-    -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
-    -DEXECUTORCH_BUILD_DEVTOOLS=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DPYTHON_EXECUTABLE=python3 \
-    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-    -DANDROID_ABI='arm64-v8a' \
-    -DANDROID_PLATFORM=android-30
-
-# nproc is used to detect the number of available CPU.
-# If it is not applicable, please feel free to use the number you want.
-cmake --build $PWD --target install -j$(nproc)
-
-cmake ../examples/qualcomm \
-    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
-    -DANDROID_ABI='arm64-v8a' \
-    -DANDROID_PLATFORM=android-30 \
-    -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \
-    -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
-    -DPYTHON_EXECUTABLE=python3 \
-    -Bexamples/qualcomm
-
-cmake --build examples/qualcomm -j$(nproc)
-
-# qnn_executor_runner can be found under examples/qualcomm
-# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/executor_runner/qnn_executor_runner
-ls examples/qualcomm
-```
-
-**Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options.
-
 
 ## Deploying and running on device
 
@@ -319,7 +238,7 @@ adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV79Stub.so ${DEVICE_DIR}
 adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR}
 adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR}
 adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR}
+adb push ${QNN_SDK_ROOT}/lib/hexagon-v79/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR}
 ```
 
 ***Step 2***.  We also need to indicate dynamic linkers on Android and Hexagon
@@ -365,13 +284,13 @@ The model, inputs, and output location are passed to `qnn_executorch_runner` by
 
 ## Supported model list
 
-Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models.
+Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `$EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models.
 
 ## How to Support a Custom Model in HTP Backend
 
 ### Step-by-Step Implementation Guide
 
-Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more compilated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference
+Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more complicated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference
 #### Step 1: Prepare Your Model
 ```python
 import torch
@@ -478,4 +397,4 @@ print(f"Model successfully exported to {model_name}")
 ## FAQ
 
 If you encounter any issues while reproducing the tutorial, please file a github
-issue on ExecuTorch repo and tag use `#qcom_aisw` tag
+[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag
diff --git a/docs/source/backends-samsung-exynos.md b/docs/source/backends-samsung-exynos.md
new file mode 100644
index 00000000000..0d77936bf7f
--- /dev/null
+++ b/docs/source/backends-samsung-exynos.md
@@ -0,0 +1 @@
+# Samsung Exynos Backend (TBD)
diff --git a/docs/source/backends-section.md b/docs/source/backends-section.md
new file mode 100644
index 00000000000..29a235a9416
--- /dev/null
+++ b/docs/source/backends-section.md
@@ -0,0 +1 @@
+```{include} backends-overview.md
diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md
index d1a120e69fa..42e76741ec8 100644
--- a/docs/source/backends-xnnpack.md
+++ b/docs/source/backends-xnnpack.md
@@ -67,10 +67,11 @@ The XNNPACK delegate can also be used as a backend to execute symmetrically quan
 
 ### Supported Quantization Schemes
 The XNNPACK delegate supports the following quantization schemes:
+
 - 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow).
-    - Supports both static and dynamic activations.
-    - Supports per-channel and per-tensor schemes.
-    - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
+  - Supports both static and dynamic activations.
+  - Supports per-channel and per-tensor schemes.
+  - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators.
 
 Weight-only quantization is not currently supported on XNNPACK.
 
@@ -81,7 +82,7 @@ To perform 8-bit quantization with the PT2E flow, perform the following steps pr
 1) Create an instance of the `XnnpackQuantizer` class. Set quantization parameters.
 2) Use `torch.export.export` to prepare for quantization.
 3) Call `prepare_pt2e` to prepare the model for quantization.
-4) For static quantization, run the prepared model with representative samples to calibrate the quantizated tensor activation ranges.
+4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges.
 5) Call `convert_pt2e` to quantize the model.
 6) Export and lower the model using the standard flow.
 
diff --git a/docs/source/backends.md b/docs/source/backends.md
deleted file mode 100644
index 53db638f36d..00000000000
--- a/docs/source/backends.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# Backends
-
-```{toctree}
-:maxdepth: 1
-
-backends-overview
-backends-xnnpack
-backends-coreml
-backends-mps
-backends-vulkan
-backends-arm-ethos-u
-backends-qualcomm
-backends-mediatek
-backends-cadence
-OpenVINO Backend <build-run-openvino>
-backends-nxp
-```
diff --git a/docs/source/build-run-openvino.md b/docs/source/build-run-openvino.md
index dc6f098850f..9b4c48fee5a 100644
--- a/docs/source/build-run-openvino.md
+++ b/docs/source/build-run-openvino.md
@@ -61,7 +61,7 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct
 
 Follow the steps below to setup your build environment:
 
-1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](getting-started-setup.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment.
+1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](using-executorch-building-from-source.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment.
 
 2. **Setup OpenVINO Backend Environment**
 - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory
@@ -92,7 +92,7 @@ The exported model will be saved as 'resnet50.pte' in the current directory.
 
 ### Build C++ OpenVINO Examples
 
-After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `<executorch_root>/cmake-out/backends/openvino/`.
+After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `<executorch_root>/cmake-out/`.
 
 The executable requires a model file (`.pte` file generated in the aot step) and the number of inference executions.
 
@@ -101,7 +101,7 @@ The executable requires a model file (`.pte` file generated in the aot step) and
 Run inference with a given model for 10 executions:
 
 ```
-./openvino_executor_runner \
+./executor_runner \
     --model_path=model.pte \
     --num_executions=10
 ```
diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md
index 79897737268..c0b03938374 100644
--- a/docs/source/bundled-io.md
+++ b/docs/source/bundled-io.md
@@ -17,7 +17,7 @@ This stage mainly focuses on the creation of a `BundledProgram` and dumping it o
 
 ### Step 1: Create a Model and Emit its ExecuTorch Program.
 
-ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial).
+ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore -->.
 
 ### Step 2: Construct `List[MethodTestSuite]` to hold test info
 
@@ -194,7 +194,7 @@ regenerate_bundled_program = deserialize_from_flatbuffer_to_bundled_program(seri
 ```
 
 ## Runtime Stage
-This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it.
+This stage mainly focuses on executing the model with the bundled inputs and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it.
 
 
 ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer
diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md
index c633bb1fd12..b057f3afa2e 100644
--- a/docs/source/compiler-delegate-and-partitioner.md
+++ b/docs/source/compiler-delegate-and-partitioner.md
@@ -1,4 +1,4 @@
-# Backends and Delegates
+# Understanding Backends and Delegates
 
 Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch
 
@@ -37,7 +37,7 @@ The diagram looks like following
 There are mainly two Ahead-of-Time entry point for backend to implement: `partition` and `preprocess`.
 
 `partitioner` is an algorithm implemented by the backend to tag the nodes to be lowered to the backend. `to_backend` API will apply the partition algorithm and lower each subgraph, which consists of connected tagged nodes, to the targeted backend. Every subgraph
-will be sent to the `preprocess` part provided by the backend to compiled as a binary blob.
+will be sent to the `preprocess` part provided by the backend to be compiled as a binary blob.
 
 During partition, the `exported_program` is not allowed to mutate the program, and it's supposed to apply tag to each node. The
 `PartitionResult` includes both tagged exported program and the partition tags dictionary for `to_backend` to look up the tag and
@@ -194,8 +194,8 @@ qnnpack is one backend and xnnpack is another backend. We haven't open-sourced
 these two backends delegates yet, and this example won't run out of box. It can
 be used as a reference to see how it can be done.
 
-This option is easy to try becuase usually all backends will implement their own
-parititioner. However this option may get different results if we change the
+This option is easy to try because usually all backends will implement their own
+partitioner. However this option may get different results if we change the
 order of to_backend call. If we want to have a better control on the nodes, like
 which backend they should go, option 2 is better.
 
diff --git a/docs/source/compiler-ir-advanced.md b/docs/source/compiler-ir-advanced.md
new file mode 100644
index 00000000000..b6d24026d5a
--- /dev/null
+++ b/docs/source/compiler-ir-advanced.md
@@ -0,0 +1,31 @@
+(compiler-ir-advanced)=
+# Compiler & IR
+
+Advanced compiler features and intermediate representation specifications.
+
+## Compiler Passes
+
+- {doc}`compiler-custom-compiler-passes` — Custom compiler passes and optimization
+
+## Memory Management
+
+- {doc}`compiler-memory-planning` — Advanced memory planning strategies
+
+## Intermediate Representation
+
+- {doc}`ir-exir` — EXIR (Export Intermediate Representation) specification
+- {doc}`ir-ops-set-definition` — Ops set definition and operator standardization
+
+## Backend dialect
+
+- {doc}`compiler-backend-dialect` — Backend dialect and compiler integration
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+compiler-custom-compiler-passes
+compiler-memory-planning
+ir-exir
+ir-ops-set-definition
+compiler-backend-dialect
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f1869d38a46..31abdef2820 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -24,7 +24,7 @@
 import sys
 from typing import Any
 
-import pytorch_sphinx_theme2  # type: ignore[import-untyped]
+import pytorch_sphinx_theme2  # type: ignore[import-not-found]
 
 # To let us import ./custom_directives.py
 sys.path.insert(0, os.path.abspath("."))
@@ -74,7 +74,7 @@
     "xml",  # {repo_root}/docs/cpp/build/xml
 )
 
-html_favicon = "_static/img/ExecuTorch-Logo-cropped.svg"
+html_favicon = "_static/img/executorch-chip-logo.svg"
 
 # Get ET_VERSION_DOCS during the build.
 et_version_docs = os.environ.get("ET_VERSION_DOCS", None)
diff --git a/docs/source/desktop-backends.md b/docs/source/desktop-backends.md
new file mode 100644
index 00000000000..e4220edb47f
--- /dev/null
+++ b/docs/source/desktop-backends.md
@@ -0,0 +1,27 @@
+(desktop-backends)=
+# Backends
+
+Available hardware acceleration backends for desktop platforms.
+
+## Linux Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+## macOS Backends
+
+- {doc}`desktop-coreml` — CoreML (recommended for Apple Silicon)
+- {doc}`desktop-mps` — Metal Performance Shaders (Apple Silicon GPU)
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+
+## Windows Backends
+
+- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration)
+- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization)
+
+```{toctree}
+:hidden:
+desktop-xnnpack
+desktop-openvino
+desktop-coreml
+desktop-mps
diff --git a/docs/source/desktop-coreml.md b/docs/source/desktop-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/desktop-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/desktop-mps.md b/docs/source/desktop-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/desktop-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/desktop-openvino.md b/docs/source/desktop-openvino.md
new file mode 100644
index 00000000000..a0fd5774c73
--- /dev/null
+++ b/docs/source/desktop-openvino.md
@@ -0,0 +1 @@
+```{include} build-run-openvino.md
diff --git a/docs/source/desktop-section.md b/docs/source/desktop-section.md
new file mode 100644
index 00000000000..7afccbe1d4f
--- /dev/null
+++ b/docs/source/desktop-section.md
@@ -0,0 +1,19 @@
+(desktop-section)=
+# Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends for each platform.
+
+## Platform Overview & Runtime
+
+- {doc}`using-executorch-cpp` — C++ runtime integration guide
+- {doc}`using-executorch-building-from-source` — Building ExecuTorch from source
+
+## Backends
+
+- {doc}`desktop-backends` — Available desktop backends and platform-specific optimization
+
+```{toctree}
+:hidden:
+using-executorch-cpp
+using-executorch-building-from-source
+desktop-backends
diff --git a/docs/source/desktop-xnnpack.md b/docs/source/desktop-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/desktop-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md
index 449dd1485dc..8e13e67f1a1 100644
--- a/docs/source/devtools-overview.md
+++ b/docs/source/devtools-overview.md
@@ -41,6 +41,6 @@ More details are available in the [ETDump documentation](etdump.md) on how to ge
 
 
 ### Inspector APIs
-The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API.
+The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API.
 
 More details are available in the [Inspector API documentation](model-inspector.rst) on how to use the Inspector APIs.
diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md
index 7c6cedc311b..6d540dc7f35 100644
--- a/docs/source/devtools-tutorial.md
+++ b/docs/source/devtools-tutorial.md
@@ -1,3 +1,3 @@
 ## Developer Tools Usage Tutorial
 
-Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools.
+Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) <!-- @lint-ignore --> for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools.
diff --git a/docs/source/edge-platforms-section.md b/docs/source/edge-platforms-section.md
new file mode 100644
index 00000000000..99e44093544
--- /dev/null
+++ b/docs/source/edge-platforms-section.md
@@ -0,0 +1,73 @@
+(edge-platforms-section)=
+# Edge
+
+Deploy ExecuTorch on mobile, desktop, and embedded platforms with optimized backends for each.
+
+ExecuTorch supports deployment across a wide variety of edge computing platforms, from high-end mobile devices to constrained embedded systems and microcontrollers.
+
+## Android
+
+Deploy ExecuTorch on Android devices with hardware acceleration support.
+
+**→ {doc}`android-section` — Complete Android deployment guide**
+
+Key features:
+- Hardware acceleration support (CPU, GPU, NPU)
+- Multiple backend options (XNNPACK, Vulkan, Qualcomm, MediaTek, ARM, Samsung)
+- Comprehensive examples and demos
+
+## iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+**→ {doc}`ios-section` — Complete iOS deployment guide**
+
+Key features:
+- Apple hardware optimization (CoreML, MPS, XNNPACK)
+- Swift and Objective-C integration
+- LLM and computer vision examples
+
+## Desktop & Laptop Platforms
+
+Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends.
+
+**→ {doc}`desktop-section` — Complete desktop deployment guide**
+
+Key features:
+- Cross-platform C++ runtime
+- Platform-specific optimization (OpenVINO, CoreML, MPS)
+- CPU and GPU acceleration options
+
+## Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+**→ {doc}`embedded-section` — Complete embedded deployment guide**
+
+Key features:
+
+- Resource-constrained deployment
+- DSP and NPU acceleration (Cadence, ARM Ethos-U, NXP)
+- Custom backend development support
+- LLM and computer vision examples
+
+## Troubleshooting & Support
+
+- **{doc}`using-executorch-troubleshooting`** - Common issues and solutions across all platforms
+
+## Next Steps
+
+After choosing your platform:
+- **{doc}`backends-section`** - Deep dive into backend selection and optimization
+- **{doc}`llm/working-with-llms`** - Working with Large Language Models on edge devices
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Edge Platforms
+
+android-section
+ios-section
+desktop-section
+embedded-section
+using-executorch-troubleshooting
diff --git a/docs/source/embedded-arm-ethos-u.md b/docs/source/embedded-arm-ethos-u.md
new file mode 100644
index 00000000000..cdc544a6553
--- /dev/null
+++ b/docs/source/embedded-arm-ethos-u.md
@@ -0,0 +1 @@
+```{include} backends-arm-ethos-u.md
diff --git a/docs/source/embedded-backends.md b/docs/source/embedded-backends.md
new file mode 100644
index 00000000000..4ed7962ef42
--- /dev/null
+++ b/docs/source/embedded-backends.md
@@ -0,0 +1,20 @@
+(embedded-backends)=
+# Backends
+
+Available hardware acceleration backends for embedded systems.
+
+## DSP Acceleration
+
+- {doc}`embedded-cadence` — Cadence Xtensa DSP processors
+
+## NPU Acceleration
+
+- {doc}`embedded-arm-ethos-u` — ARM Ethos-U NPU acceleration
+- {doc}`embedded-nxp` — NXP eIQ Neutron Backend
+
+
+```{toctree}
+:hidden:
+embedded-cadence
+embedded-arm-ethos-u
+embedded-nxp
diff --git a/docs/source/embedded-cadence.md b/docs/source/embedded-cadence.md
new file mode 100644
index 00000000000..d2f7ea78259
--- /dev/null
+++ b/docs/source/embedded-cadence.md
@@ -0,0 +1 @@
+```{include} backends-cadence.md
diff --git a/docs/source/embedded-nxp.md b/docs/source/embedded-nxp.md
new file mode 100644
index 00000000000..35d8f0ab75d
--- /dev/null
+++ b/docs/source/embedded-nxp.md
@@ -0,0 +1 @@
+```{include} backends-nxp.md
diff --git a/docs/source/embedded-section.md b/docs/source/embedded-section.md
new file mode 100644
index 00000000000..834001afbc3
--- /dev/null
+++ b/docs/source/embedded-section.md
@@ -0,0 +1,39 @@
+(embedded-section)=
+
+# Embedded Systems
+
+Deploy ExecuTorch on constrained embedded systems and microcontrollers.
+
+## API Reference & Development
+
+Start here for C++ development with ExecuTorch runtime APIs and essential tutorials.
+
+- {doc}`executorch-runtime-api-reference` — **Start here**: Complete runtime API reference for embedded development
+- {doc}`running-a-model-cpp-tutorial` — Step-by-step C++ API tutorial with practical examples
+- {doc}`extension-module` — Custom module extensions for specialized functionality
+- {doc}`extension-tensor` — Tensor operations and memory management extensions
+
+## Build & Integration Guide
+
+- {doc}`using-executorch-cpp` — Complete setup guide for C++ runtime integration
+- {doc}`using-executorch-building-from-source` — Building from Source
+
+## Choose Backend for acceleration
+
+- {doc}`embedded-backends` — Available embedded backends and acceleration options
+
+## Tutorials
+
+- {doc}`tutorial-arm-ethos-u` — Export a simple PyTorch model for the ExecuTorch Ethos-U backend
+
+
+```{toctree}
+:hidden:
+executorch-runtime-api-reference
+running-a-model-cpp-tutorial
+extension-module
+extension-tensor
+using-executorch-cpp
+using-executorch-building-from-source
+embedded-backends
+tutorial-arm-ethos-u
diff --git a/docs/source/etrecord.rst b/docs/source/etrecord.rst
index 1ab84a6ee10..39bc45cab5a 100644
--- a/docs/source/etrecord.rst
+++ b/docs/source/etrecord.rst
@@ -23,13 +23,120 @@ It should be provided to the `Inspector API <model-inspector.html>`__ to link ba
 Generating an ``ETRecord``
 --------------------------
 
-The user should use the following API to generate an ``ETRecord`` file. They
-will be expected to provide the Edge Dialect program (returned by the call to ``to_edge()``),
-the ExecuTorch program (returned by the call to ``to_executorch()``), and optional models that
-they are interested in working with via our tooling.
+There are multiple ways to generate an ``ETRecord`` for debugging purposes:
+
+Method 1: Using the ``generate_etrecord`` Parameter (Recommended)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The recommended approach is to enable ``ETRecord`` generation by passing ``generate_etrecord=True``
+to your export API calls. This can be used with:
+
+* ``executorch.export()`` - High-level export API
+* ``to_edge()`` - Edge dialect conversion
+* ``to_edge_transform_and_lower()`` - Edge conversion with transformations and lowering
+
+After export completes, retrieve the ``ETRecord`` using the ``get_etrecord()`` method, and save it using the ``save()`` method:
+
+**Example with** ``executorch.export()``:
+
+.. code-block:: python
+
+    import executorch
+    from executorch.export import ExportRecipe
+
+    # Export with ETRecord generation enabled
+    session = executorch.export(
+        model=model,
+        example_inputs=[example_inputs],
+        export_recipe=recipe,
+        generate_etrecord=True  # Enable ETRecord generation
+    )
+
+    # Get and save the ETRecord
+    etrecord = session.get_etrecord()
+    etrecord.save("model_debug.etrecord")
+
+**Example with** ``to_edge()``:
+
+.. code-block:: python
+
+    from executorch.exir.program import to_edge
+    from torch.export import export
+
+    # Export model first
+    exported_program = export(model, example_inputs)
+
+    # Convert to edge with ETRecord generation
+    edge_manager = to_edge(
+        exported_program,
+        generate_etrecord=True  # Enable ETRecord generation
+    )
+
+    # Apply transformations
+    edge_manager = edge_manager.to_backend()
+    et_manager = edge_manager.to_executorch()
+
+    # Get and save ETRecord
+    etrecord = et_manager.get_etrecord()
+    etrecord.save("edge_debug.etrecord")
+
+**Example with** ``to_edge_transform_and_lower()``:
+
+.. code-block:: python
+
+    from executorch.exir.program import to_edge_transform_and_lower
+    from torch.export import export
+
+    # Export model first
+    exported_program = export(model, example_inputs)
+
+    # Transform and lower with ETRecord generation
+    edge_manager = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[MyPartitioner()],
+        generate_etrecord=True  # Enable ETRecord generation
+    )
+
+    et_manager = edge_manager.to_executorch()
+
+    # Get and save ETRecord
+    etrecord = et_manager.get_etrecord()
+    etrecord.save("debug.etrecord")
+
+Method 2: Using the ``generate_etrecord()`` Function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can also use the standalone ``generate_etrecord()`` function to generate an ``ETRecord``.
+This method requires you to provide the Edge Dialect program (returned by ``to_edge()``),
+the ExecuTorch program (returned by ``to_executorch()``), and optional models.
 
 .. warning::
-    Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process.
+    When using the standalone function, users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process.
+
+**Example:**
+
+.. code-block:: python
+
+    import copy
+    from executorch.devtools import generate_etrecord
+    from torch.export import export
+
+    # Export and convert to edge
+    aten_dialect = export(model, example_inputs, strict=True)
+    edge_program = to_edge(aten_dialect)
+
+    # Create copy for ETRecord (needed because to_executorch modifies in-place)
+    edge_program_copy = copy.deepcopy(edge_program)
+
+    # Convert to ExecutorchProgramManager
+    executorch_program = edge_program_copy.to_executorch()
+
+    # Generate ETRecord separately
+    generate_etrecord(
+        "debug.etrecord",
+        edge_program,
+        executorch_program,
+    )
 
 .. currentmodule:: executorch.devtools.etrecord._etrecord
 .. autofunction:: generate_etrecord
diff --git a/docs/source/export-overview.md b/docs/source/export-overview.md
index d07701d06cd..c96716a0949 100644
--- a/docs/source/export-overview.md
+++ b/docs/source/export-overview.md
@@ -11,5 +11,5 @@ program, making it easier for you to understand and implement the process.
 
 To learn more about exporting your model:
 
-* Complete the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial).
+* Complete the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore -->.
 * Read the [torch.export documentation](https://pytorch.org/docs/2.1/export.html).
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md
index 29aa6712d37..690256fecbb 100644
--- a/docs/source/extension-module.md
+++ b/docs/source/extension-module.md
@@ -6,7 +6,7 @@ In the [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial.md), we
 
 ## Example
 
-Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs:
+Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore --> using the `Module` and [`TensorPtr`](extension-tensor.md) APIs:
 
 ```cpp
 #include <executorch/extension/module/module.h>
diff --git a/docs/source/file-formats-advanced.md b/docs/source/file-formats-advanced.md
new file mode 100644
index 00000000000..c16ebccfd65
--- /dev/null
+++ b/docs/source/file-formats-advanced.md
@@ -0,0 +1,17 @@
+(file-formats-advanced)=
+
+# File Formats
+
+ExecuTorch file format specifications and internal structure.
+
+## Program File Formats
+
+- {doc}`pte-file-format` — PTE (PyTorch ExecuTorch) file format specification
+- {doc}`ptd-file-format` — PTD file format specification
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+pte-file-format
+ptd-file-format
diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md
index ef4a12d1a7f..617d521b802 100644
--- a/docs/source/getting-started-architecture.md
+++ b/docs/source/getting-started-architecture.md
@@ -4,7 +4,7 @@ This page describes the technical architecture of ExecuTorch and its individual
 
 **Context**
 
-In order to target on-device AI with diverse hardware, critical power requirements, and realtime processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extendable architecture is desired. ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms.
+In order to target on-device AI with diverse hardware, critical power requirements, and real-time processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extensible architecture is desired. ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms.
 
 ## Overview
 
@@ -89,6 +89,6 @@ _Executor_ is the entry point to load the program and execute it. The execution
 
 ## Developer Tools
 
-It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases.
+It should be efficient for users to go from research to production using the flow above. Productivity is especially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases.
 
 During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments.
diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md
index d3d9662f5c3..51c59f5e021 100644
--- a/docs/source/getting-started.md
+++ b/docs/source/getting-started.md
@@ -68,7 +68,7 @@ with open("model.pte", "wb") as f:
 
 If the model requires varying input sizes, you will need to specify the varying dimensions and bounds as part of the `export` call. See [Model Export and Lowering](using-executorch-export.md) for more information.
 
-The hardware backend to target is controlled by the partitioner parameter to to\_edge\_transform\_and\_lower. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend.
+The hardware backend to target is controlled by the partitioner parameter to `to_edge_transform_and_lower`. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend.
 
 Quantization can also be done at this stage to reduce model size and runtime. Quantization is backend-specific. See the documentation for the target backend for a full description of supported quantization schemes.
 
@@ -89,7 +89,7 @@ input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224)
 program = runtime.load_program("model.pte")
 method = program.load_method("forward")
 output: List[torch.Tensor] = method.execute([input_tensor])
-print("Run succesfully via executorch")
+print("Run successfully via executorch")
 
 from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
 import torchvision.models as models
@@ -226,5 +226,5 @@ ExecuTorch provides a high-degree of customizability to support diverse hardware
 - [Using ExecuTorch on Android](using-executorch-android.md) and [Using ExecuTorch on iOS](using-executorch-ios.md) for mobile runtime integration.
 - [Using ExecuTorch with C++](using-executorch-cpp.md) for embedded and mobile native development.
 - [Profiling and Debugging](using-executorch-troubleshooting.md) for developer tooling and debugging.
-- [API Reference](export-to-executorch-api-reference.md) for a full description of available APIs.
+- [API Reference](export-to-executorch-api-reference.rst) for a full description of available APIs.
 - [Examples](https://github.com/pytorch/executorch/tree/main/examples) for demo apps and example code.
diff --git a/docs/source/index.md b/docs/source/index.md
index fd0957d8fd4..b65139319a7 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -1,134 +1,195 @@
 (home)=
 # Welcome to the ExecuTorch Documentation
 
-**ExecuTorch** is PyTorch's solution to training and inference on the
-Edge.
+**ExecuTorch** is PyTorch's solution for efficient AI inference on edge devices — from mobile phones to embedded systems.
 
 ## Key Value Propositions
 
-- **Portability:** Compatibility with a wide variety of computing
-  platforms, from high-end mobile phones to highly constrained
-  embedded systems and microcontrollers.
-- **Productivity:** Enabling developers to use the same toolchains and
-  Developer Tools from PyTorch model authoring and conversion, to
-  debugging and deployment to a wide variety of platforms.
-- **Performance:** Providing end users with a seamless and
-  high-performance experience due to a lightweight runtime and
-  utilizing full hardware capabilities such as CPUs, NPUs, and DSPs.
-
-ExecuTorch provides support for:
-
-* **Strong Model Support** LLMs (Large Language Models),
-  CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech)
-* **All Major Platforms** Android, Mac, Linux, Windows
-* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek, NXP, OpenVino, Qualcomm, Vulkan, XNNPACK
-
-### Documentation Navigation
-#### Introduction
-- [Overview](intro-overview)
-- [How it Works](intro-how-it-works)
-- [Getting Started with Architecture](getting-started-architecture)
-- [Concepts](concepts)
-#### Usage
-- [Getting Started](getting-started)
-- [Using Executorch Export](using-executorch-export)
-- [Using Executorch on Android](using-executorch-android)
-- [Using Executorch on iOS](using-executorch-ios)
-- [Using Executorch with C++](using-executorch-cpp)
-- [Runtime Integration](using-executorch-runtime-integration)
-- [Troubleshooting](using-executorch-troubleshooting)
-- [Building from Source](using-executorch-building-from-source)
-- [Quantization](quantization-overview)
-- [FAQs](using-executorch-faqs)
-#### Examples
-- [Android Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
-- [iOS Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
-- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md)
-#### Backends
-- [Overview](backends-overview)
-- [XNNPACK](backends-xnnpack)
-- [Core ML](backends-coreml)
-- [MPS](backends-mps)
-- [Vulkan](backends-vulkan)
-- [ARM Ethos-U](backends-arm-ethos-u)
-- [ARM VGF](backends-arm-vgf)
-- [Qualcomm](backends-qualcomm)
-- [MediaTek](backends-mediatek)
-- [Cadence](backends-cadence)
-- [OpenVINO](build-run-openvino)
-- [NXP](backend-nxp)
-#### Developer Tools
-- [Overview](devtools-overview)
-- [Bundled IO](bundled-io)
-- [ETRecord](etrecord)
-- [ETDump](etdump)
-- [Runtime Profiling](runtime-profiling)
-- [Model Debugging](model-debugging)
-- [Model Inspector](model-inspector)
-- [Memory Planning Inspection](memory-planning-inspection)
-- [Delegate Debugging](delegate-debugging)
-- [Tutorial](devtools-tutorial)
-#### Runtime
-- [Overview](runtime-overview)
-- [Extension Module](extension-module)
-- [Extension Tensor](extension-tensor)
-- [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial)
-- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking)
-- [Platform Abstraction Layer](runtime-platform-abstraction-layer)
-#### Portable C++ Programming
-- [PTE File Format](pte-file-format)
-- [PTD File Format](ptd-file-format)
-#### API Reference
-- [Export to Executorch API Reference](export-to-executorch-api-reference)
-- [Executorch Runtime API Reference](executorch-runtime-api-reference)
-- [Runtime Python API Reference](runtime-python-api-reference)
-- [API Life Cycle](api-life-cycle)
-- [Javadoc](https://pytorch.org/executorch/main/javadoc/)
-#### Kernel Library
-- [Overview](kernel-library-overview)
-- [Custom ATen Kernel](kernel-library-custom-aten-kernel)
-- [Selective Build](kernel-library-selective-build)
-#### Working with LLMs
-- [Getting Started](llm/getting-started.md)
-- [Exporting LLMs](llm/export-llm.md)
-- [Exporting custom LLMs](llm/export-custom-llm.md)
-- [Running with C++](llm/run-with-c-plus-plus.md)
-- [Running on Android (XNNPack)](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android)
-- [Running on Android (QNN)](llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md)
-- [Running on iOS](llm/run-on-ios.md)
-#### Backend Development
-- [Delegates Integration](backend-delegates-integration)
-- [XNNPACK Reference](backend-delegates-xnnpack-reference)
-- [Dependencies](backend-delegates-dependencies)
-- [Compiler Delegate and Partitioner](compiler-delegate-and-partitioner)
-- [Debug Backend Delegate](debug-backend-delegate)
-#### IR Specification
-- [EXIR](ir-exir)
-- [Ops Set Definition](ir-ops-set-definition)
-#### Compiler Entry Points
-- [Backend Dialect](compiler-backend-dialect)
-- [Custom Compiler Passes](compiler-custom-compiler-passes)
-- [Memory Planning](compiler-memory-planning)
-#### Contributing
-- [Contributing](contributing)
+- **Portability:** Run on diverse platforms, from high-end mobile to constrained microcontrollers
+- **Performance:** Lightweight runtime with full hardware acceleration (CPU, GPU, NPU, DSP)
+- **Productivity:** Use familiar PyTorch tools from authoring to deployment
+
+---
+
+## 🎯 Wins & Success Stories
+
+::::{grid} 1
+:class-container: success-showcase
+:::{grid-item-card}
+:class-header: bg-primary text-white
+:class-body: text-center
+[View All Success Stories →](success-stories)
+:::
+::::
+
+---
+
+## Quick Navigation
+
+::::{grid} 2
+
+:::{grid-item-card} **Get Started**
+:link: quick-start-section
+:link-type: doc
+
+New to ExecuTorch? Start here for installation and your first model deployment.
+:::
+
+:::{grid-item-card} **Deploy on Edge Platforms**
+:link: edge-platforms-section
+:link-type: doc
+
+Deploy on Android, iOS, Laptops / Desktops and embedded platforms with optimized backends.
+:::
+
+:::{grid-item-card} **Work with LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+Export, optimize, and deploy Large Language Models on edge devices.
+:::
+
+:::{grid-item-card} 🔧 **Developer Tools**
+:link: tools-section
+:link-type: doc
+
+Profile, debug, and inspect your models with comprehensive tooling.
+:::
+
+::::
+
+---
+
+## Explore Documentation
+
+::::{grid} 1
+:::{grid-item-card} **Intro**
+:link: intro-section
+:link-type: doc
+
+**Overview, architecture, and core concepts** — Understand how ExecuTorch works and its benefits
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Quick Start**
+:link: quick-start-section
+:link-type: doc
+
+**Get started with ExecuTorch** — Install, export your first model, and run inference
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Edge**
+:link: edge-platforms-section
+:link-type: doc
+
+**Android, iOS, Desktop, Embedded** — Platform-specific deployment guides and examples
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Backends**
+:link: backends-section
+:link-type: doc
+
+**CPU, GPU, NPU/Accelerator backends** — Hardware acceleration and backend selection
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **LLMs**
+:link: llm/working-with-llms
+:link-type: doc
+
+**LLM export, optimization, and deployment** — Complete LLM workflow for edge devices
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Advanced**
+:link: advanced-topics-section
+:link-type: doc
+
+**Quantization, memory planning, custom passes** — Deep customization and optimization
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **Tools**
+:link: tools-section
+:link-type: doc
+
+**Developer tools, profiling, debugging** — Comprehensive development and debugging suite
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **API**
+:link: api-section
+:link-type: doc
+
+**API Reference Usages & Examples** — Detailed Python, C++, and Java API references
+:::
+::::
+
+::::{grid} 1
+:::{grid-item-card} **💬 Support**
+:link: support-section
+:link-type: doc
+
+**FAQ, troubleshooting, contributing** — Get help and contribute to the project
+:::
+::::
+
+---
+
+## What's Supported
+
+::::{grid} 3
+
+:::{grid-item}
+**Model Types**
+
+- Large Language Models (LLMs)
+- Computer Vision (CV)
+- Speech Recognition (ASR)
+- Text-to-Speech (TTS)
+- More ...
+:::
+
+:::{grid-item}
+**Platforms**
+
+- Android & iOS
+- Linux, macOS, Windows
+- Embedded & MCUs
+- Go **→ {doc}`edge-platforms-section`**
+:::
+
+:::{grid-item}
+**Rich Acceleration**
+
+- CPU
+- GPU
+- NPU
+- DSP
+- Go **→ {doc}`backends-section`**
+:::
+
+::::
 
 ```{toctree}
-:glob:
-:maxdepth: 1
 :hidden:
+:maxdepth: 1
 
-intro
-usage
-examples
-backends
-developer-tools
-runtime
-api
-quantization
-kernel-library
+intro-section
+quick-start-section
+edge-platforms-section
+backends-section
 llm/working-with-llms
-backend-development
-ir-specification
-compiler-entry-points
-contributing
-```
+advanced-topics-section
+tools-section
+api-section
+support-section
diff --git a/docs/source/intro-how-it-works.md b/docs/source/intro-how-it-works.md
index 3e6d384a62f..3ced602fed4 100644
--- a/docs/source/intro-how-it-works.md
+++ b/docs/source/intro-how-it-works.md
@@ -6,7 +6,7 @@ At a high-level, there are three steps for running a PyTorch model with ExecuTor
 
 1. **Export the model.** The first step is to capture the PyTorch program as a graph, which is a new representation of the model that can be expressed in terms of a series of operators such as addition, multiplication, or convolution. This process safely preserves the semantics of the original PyTorch program. This representation is the first step to enable running the model on edge use cases that have low memory and/or low compute.
 1. **Compile the exported model to an ExecuTorch program.** Given an exported model from step 1, convert it to an executable format called an ExecuTorch program that the runtime can use for inference. This step provides entry points for various optimizations such as compressing the model (e.g., quantization) to reduce size and further compiling subgraphs down to on-device specialized hardware accelerators to improve latency. It also provides an entry point for memory planning, i.e. to efficiently plan the location of intermediate tensors to reduce the runtime memory footprint.
-1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enabled performant execution even on highly-constrained devices.
+1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enables performant execution even on highly-constrained devices.
 
 This figure illustrates the three-step process of exporting a PyTorch program, compiling it into an ExecuTorch program that targets a specific hardware device, and finally executing the program on the device using the ExecuTorch runtime.
 ![name](_static/img/how-executorch-works-high-level.png)
diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md
index 96c7982b8fe..be2fd468716 100644
--- a/docs/source/intro-overview.md
+++ b/docs/source/intro-overview.md
@@ -20,7 +20,7 @@ Key value propositions of ExecuTorch are:
 ## Why ExecuTorch?
 
 Supporting on-device AI presents unique challenges with diverse hardware,
-critical power requirements, low/no internet connectivity, and realtime
+critical power requirements, low/no internet connectivity, and real-time
 processing needs. These constraints have historically prevented or slowed down
 the creation of scalable and performant on-device AI solutions. We designed
 ExecuTorch, backed by our industry partners like Meta, Arm, Apple, and Qualcomm,
diff --git a/docs/source/intro-section.md b/docs/source/intro-section.md
new file mode 100644
index 00000000000..2f6f3c57c88
--- /dev/null
+++ b/docs/source/intro-section.md
@@ -0,0 +1,27 @@
+(intro-section)=
+
+# Intro
+
+Overview, architecture, and core concepts of ExecuTorch.
+
+ExecuTorch is PyTorch's solution for training and inference on the Edge, providing portability, productivity, and performance for edge computing platforms.
+
+## Getting Started with ExecuTorch
+
+New to ExecuTorch? Start with these foundational topics:
+
+- **{doc}`intro-overview`** - High-level overview of ExecuTorch capabilities
+- **{doc}`intro-how-it-works`** - Technical overview of the ExecuTorch workflow
+- **{doc}`getting-started-architecture`** - System architecture and components
+- **{doc}`concepts`** - Core concepts and terminology
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Introduction Topics
+
+intro-overview
+intro-how-it-works
+getting-started-architecture
+concepts
+```
diff --git a/docs/source/intro.md b/docs/source/intro.md
deleted file mode 100644
index f6609cc3ba7..00000000000
--- a/docs/source/intro.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Intro
-
-```{toctree}
-:maxdepth: 1
-
-intro-overview
-intro-how-it-works
-getting-started-architecture
-concepts
-```
diff --git a/docs/source/ios-backends.md b/docs/source/ios-backends.md
new file mode 100644
index 00000000000..cb186f53319
--- /dev/null
+++ b/docs/source/ios-backends.md
@@ -0,0 +1,19 @@
+(ios-backends)=
+# Backends
+
+Available hardware acceleration backends for iOS deployment.
+
+## Apple Hardware Acceleration (Recommended)
+
+- {doc}`ios-coreml` — CoreML (NPU/GPU, recommended for iOS)
+- {doc}`ios-mps` — Metal Performance Shaders (GPU)
+
+## CPU Acceleration
+
+- {doc}`ios-xnnpack` — XNNPACK (CPU acceleration)
+
+```{toctree}
+:hidden:
+ios-coreml
+ios-mps
+ios-xnnpack
diff --git a/docs/source/ios-coreml.md b/docs/source/ios-coreml.md
new file mode 100644
index 00000000000..48271326d87
--- /dev/null
+++ b/docs/source/ios-coreml.md
@@ -0,0 +1 @@
+```{include} backends-coreml.md
diff --git a/docs/source/ios-examples.md b/docs/source/ios-examples.md
new file mode 100644
index 00000000000..86acf3273a6
--- /dev/null
+++ b/docs/source/ios-examples.md
@@ -0,0 +1,4 @@
+# Examples & Demos
+
+- [iOS LLM Examples Repository](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple)
+- [MobileViT Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo)
diff --git a/docs/source/ios-mps.md b/docs/source/ios-mps.md
new file mode 100644
index 00000000000..d6f305d33aa
--- /dev/null
+++ b/docs/source/ios-mps.md
@@ -0,0 +1 @@
+```{include} backends-mps.md
diff --git a/docs/source/ios-section.md b/docs/source/ios-section.md
new file mode 100644
index 00000000000..33c9a61ce1d
--- /dev/null
+++ b/docs/source/ios-section.md
@@ -0,0 +1,23 @@
+(ios-section)=
+# iOS
+
+Deploy ExecuTorch on iOS devices with Apple hardware acceleration.
+
+## Quick Start & Integration
+
+- {doc}`using-executorch-ios` — Complete iOS integration guide
+
+## Backends
+
+- {doc}`ios-backends` — Available iOS backends and acceleration options
+
+## Examples & Demos
+
+- {doc}`ios-examples` — Explore iOS Examples & Demos
+
+
+```{toctree}
+:hidden:
+using-executorch-ios
+ios-backends
+ios-examples
diff --git a/docs/source/ios-xnnpack.md b/docs/source/ios-xnnpack.md
new file mode 100644
index 00000000000..315dd747006
--- /dev/null
+++ b/docs/source/ios-xnnpack.md
@@ -0,0 +1 @@
+```{include} backends-xnnpack.md
diff --git a/docs/source/kernel-library-advanced.md b/docs/source/kernel-library-advanced.md
new file mode 100644
index 00000000000..5f0215b87c1
--- /dev/null
+++ b/docs/source/kernel-library-advanced.md
@@ -0,0 +1,23 @@
+(kernel-library-advanced)=
+
+# Kernel Library Deep Dive
+
+Advanced kernel implementation and customization for ExecuTorch.
+
+## Kernel Library Overview
+
+- {doc}`kernel-library-overview` — Architecture and design of the kernel library
+
+- {doc}`kernel-library-custom-aten-kernel` — Kernel registration and customization
+
+## Build Optimization
+
+- {doc}`kernel-library-selective-build` — Selective build for reduced binary footprint
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+kernel-library-overview
+kernel-library-custom-aten-kernel
+kernel-library-selective-build
diff --git a/docs/source/kernel-library-overview.md b/docs/source/kernel-library-overview.md
index cfd46524097..a826b334ba4 100644
--- a/docs/source/kernel-library-overview.md
+++ b/docs/source/kernel-library-overview.md
@@ -1,7 +1,7 @@
-This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
-
 # Overview of ExecuTorch’s Kernel Libraries
 
+This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries.
+
 An ExecuTorch program encodes instructions that describe the computation that should be performed by the program. Many of these instructions will correspond to calling a specific ATen operator, for example `aten.convolution`. However, one of the core design principles of ExecuTorch is that the signature of an operator should be separate from the implementation of the operator. This means that the ExecuTorch runtime does not ship with any standard implementation for ATen operators; users must make sure to link against kernel libraries that contain implementations of the operators required by their ExecuTorch program, and configure [operator registration](kernel-library-custom-aten-kernel.md) to map an operator signature to the desired implementation. This makes it easy to adjust the implementation of operators such as `aten.convolution` that will be called when executing an ExecuTorch program; it allows users to select the exact operator implementations that will meet the unique performance, memory usage, battery usage, etc. constraints of their use-case.
 
 **In essence, a kernel library is simply a collection of ATen operator implementations that follow a common theme or design principle**. Note that due to ExecuTorch’s selective build process (discussed in the following section), operator implementations are linked individually. This means that users can easily mix different kernel libraries in their build without sacrificing build size.
diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md
index 7d6495656a2..666206acb94 100644
--- a/docs/source/kernel-library-selective-build.md
+++ b/docs/source/kernel-library-selective-build.md
@@ -65,7 +65,7 @@ gen_selected_ops(
 )
 ```
 
-The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/BujSet/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. `OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction. 
+The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/pytorch/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. `OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction.
 
 ### Select all ops
 
@@ -83,7 +83,7 @@ This API lets users pass in a list of operator names. Note that this API can be
 
 ### Select ops from model
 
-This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model. 
+This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model.
 
 ### Dtype Selective Build
 
@@ -91,7 +91,7 @@ Beyond pruning the binary to remove unused operators, the binary size can furthe
 
 ## Example Walkthrough
 
-In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L48-L72), we have the following cmake config options:
+In [examples/selective_build/CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt), we have the following cmake config options:
 
 1. `EXECUTORCH_SELECT_OPS_YAML`
 2. `EXECUTORCH_SELECT_OPS_LIST`
@@ -99,10 +99,10 @@ In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorc
 4. `EXECUTORCH_SELECT_OPS_FROM_MODEL`
 5. `EXECUTORCH_DTYPE_SELECTIVE_BUILD`
 
-These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L110-L123). The following table describes some examples of how the invocation changes when these configs are set:
+These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt). The following table describes some examples of how the invocation changes when these configs are set:
 
 | Example cmake Call | Resultant `gen_selected_ops` Invocation |
-| :----: | :---:| 
+| :----: | :---:|
 |<code><br>  cmake -D… -DSELECT_OPS_LIST="aten::add.out,aten::mm.out" <br></code> | <code><br>  gen_selected_ops("" "${SELECT_OPS_LIST}" "" "" "") <br></code> |
 |<code><br> cmake -D… -DSELECT_OPS_YAML=ON <br></code> | <code><br>  set(_custom_ops_yaml ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops.yaml) <br> gen_selected_ops("${_custom_ops_yaml}" "" "") <br></code> |
 |<code><br> cmake -D… -DEXECUTORCH_SELECT_OPS_FROM_MODEL="model.pte.out" <br></code> | <code><br> gen_selected_ops("" "" "" "${_model_path}" "") <br></code> |
diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
index 4587589a51b..ae1b4f15c99 100644
--- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
+++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md
@@ -1,6 +1,7 @@
-# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend
+# Run Llama 3 3B Instruct on Android (with Qualcomm AI Engine Direct Backend)
 
-This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device.
+This tutorial demonstrates how to export and run the Llama 3 3B Instruct model on a Qualcomm device using the Qualcomm AI Engine Direct Backend via ExecuTorch.
+We use a static Llama [implementation](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/model/static_llama.py) to optimize performance and memory usage during on-device inference.
 
 ## Prerequisites
 
@@ -13,10 +14,8 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng
 
 ## Instructions
 
-### Step 1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant)
-
-1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`.
-2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**.
+### Step 1: Prepare the checkpoint and tokenizer of the model.
+1. For Llama 3 tokenizer and checkpoint, please refer to [instructions](https://www.llama.com/models/llama-3) for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`.
 
 ### Step 2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend
 Deploying large language models like Llama 3 on-device presents the following challenges:
@@ -25,122 +24,79 @@ Deploying large language models like Llama 3 on-device presents the following ch
 2. High model loading and inference time.
 3. Difficulty in quantization.
 
-To address these challenges, we have implemented the following solutions:
-1. Using `quantization.pt2e_quantize = "qnn_16a4w'` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference.
-2. Using `backed.qnn.num_sharding = 8` to shard the model into sub-parts.
-3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations.
-4. Using `backend.qnn.optimized_rotation_path = "<path_to_optimized_matrix>"` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy.
-5. Using `quantization.calibration_data = "<|start_header_id|>system<|end_header_id|..."` to ensure that during quantization, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/).
+To address these, we apply the following optimizations:
+
+1. Quantization: Use `QuantDtype.use_16a4w_block` for post-training quantization to reduce model size and memory usage.
+
+2. Mixed Precision Quantization: compresses KV cache tensors to 8-bit and applies `QuantDtype.use_16a8w` to the LM head.
+
+3. Model Sharding: Set `num_sharding` = 4 to shard the model into sub-parts. This helps reduce memory pressure and improve performance during on-device inference. The number of shards might be different depending on the model size.
+
+4. Graph Transformations: Convert operations into accelerator-friendly formats for better runtime performance.
+
+You can find the full optimization configuration in this [file](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/__init__.py), as shown below:
+
+``` python
+@register_llm_model("llama3_2-3b_instruct")
+@dataclass(init=False, frozen=True)
+class Llama3_2_3B_Instruct(LLMModelConfig):
+    repo_id = None
+    params_path = None
+    convert_weights = None
+    transform_weight = True
+    # The Llama3_2 enabled should be instruct, however, Llama's tokenizer does not provide utility to apply chat template.
+    instruct_model = False
+
+    num_sharding = 4
+    # quant config
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 32  # Group size used in block quantization for weight quantization. Will only be used when ptq = 16a4w_block
+    masked_softmax = False
+  
+    # SeqMSE Quantization: optimizes the parameter encodings of each layer of a model individually to minimize the difference between the layer’s original and quantized outputs. (Implementation details: ./backends/qualcomm/_passes/seq_mse.py) In this configuration, we set `seq_mse_candidates` = 0, which means SeqMSE quantization is not applied.
+    seq_mse_candidates = 0
+    r1 = False
+    r2 = False
+    r3 = False
+    custom_annotation = (
+        annotate_kv_8bit,
+        annotate_output_16a8w,
+    )
+```
+
 
 To export with the Qualcomm AI Engine Direct Backend, ensure the following:
 
-1. The host machine has more than 100GB of memory (RAM + swap space).
+1. The host machine has more than 64GB of memory (RAM + swap space).
 2. The entire process takes a few hours.
 
 ```bash
-# path/to/config.yaml
-base:
-  model_class: llama3
-  checkpoint: path/to/consolidated.00.pth
-  params: path/to/params.json
-  tokenizer_path: path/to/tokenizer.model
-  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
-model:
-  use_kv_cache: True
-  enable_dynamic_shape: False
-quantization:
-  pt2e_quantize: qnn_16a4w
-  # Please note that calibration_data must include the prompt template for special tokens.
-  calibration_data: "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-backend:
-  qnn:
-    enabled: True
-    num_sharding: 8
-    
-
-# export_llm
-python -m extension.llm.export.export_llm \
-  --config path/to/config.yaml
+# export llama
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --compile_only
 ```
+Note: end-to-end [instructions](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md)
 
 ### Step 3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs
-1. Build executorch with Qualcomm AI Engine Direct Backend for android
-    ```bash
-    cmake \
-        -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \
-        -DANDROID_ABI=arm64-v8a \
-        -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_QNN=ON \
-        -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -Bcmake-android-out .
-
-    cmake --build cmake-android-out -j16 --target install --config Release
-    ```
-2. Build llama runner for android
-```bash
-    cmake \
-        -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake  \
-        -DANDROID_ABI=arm64-v8a \
-        -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-        -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \
-        -DEXECUTORCH_BUILD_QNN=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -Bcmake-android-out/examples/models/llama examples/models/llama
-
-    cmake --build cmake-android-out/examples/models/llama -j16 --config Release
-```
-3. Run on Android via adb shell
-*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone
-
 **3.1 Connect your android phone**
 
-**3.2 We need to push required QNN libraries to the device.**
-```bash
-# make sure you have write-permission on below path.
-DEVICE_DIR=/data/local/tmp/llama
-adb shell mkdir -p ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR}
-adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR}
-```
-
-**3.3 Upload model, tokenizer and llama runner binary to phone**
-```bash
-adb push <model.pte> ${DEVICE_DIR}
-adb push <tokenizer.model> ${DEVICE_DIR}
-adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
-adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR}
-```
+**3.2 Make sure the following artifact is present before running the model.**
+-- artifact/
+   └── llama_qnn.pte
 
-**3.4 Run model**
+**3.3 Run model**
 ```bash
-adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path <model.pte> --tokenizer_path <tokenizer.model> --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128"
-```
-You should see the message:
-```
-<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the
+# Run llama
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --pre_gen_pte ${PATH_TO_ARTIFACT}
 ```
 
 ## What is coming?
 - Performance improvements
 - Reduce the memory pressure during inference to support 12GB Qualcomm devices
-- Support more LLMs (Qwen, Phi-4-mini, etc.)
+- Broader LLM Support via [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch?tab=readme-ov-file#llms-large-language-models)
+
+  - Already supported models (e.g.): Llama2, Llama3, Gemma, Qwen, Phi-4, SmolLM. For usage examples, please refer to [README](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md)
 
 ## FAQ
 
 If you encounter any issues while reproducing the tutorial, please file a github
-issue on ExecuTorch repo and tag use `#qcom_aisw` tag
+[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag
\ No newline at end of file
diff --git a/docs/source/llm/export-custom-llm.md b/docs/source/llm/export-custom-llm.md
index 57537ba31d8..4797f773fa3 100644
--- a/docs/source/llm/export-custom-llm.md
+++ b/docs/source/llm/export-custom-llm.md
@@ -81,7 +81,7 @@ with open("nanogpt.pte", "wb") as file:
 
 To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory.
 
-For more information, see [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) and
+For more information, see [Exporting to ExecuTorch](../tutorials/export-to-executorch-tutorial) <!-- @lint-ignore --> and
 [torch.export](https://pytorch.org/docs/stable/export.html).
 
 ## Backend delegation
@@ -143,7 +143,7 @@ example_inputs = (
 # long as they adhere to the rules specified in the dynamic shape configuration.
 # Here we set the range of 0th model input's 1st dimension as
 # [0, model.config.block_size].
-# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes
+# See ../concepts.html#dynamic-shapes
 # for details about creating dynamic shapes.
 dynamic_shape = (
     {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)},
diff --git a/docs/source/llm/export-llm.md b/docs/source/llm/export-llm.md
index e9f8307f2c3..082b8c2b18d 100644
--- a/docs/source/llm/export-llm.md
+++ b/docs/source/llm/export-llm.md
@@ -78,7 +78,7 @@ python -m extension.llm.export.export_llm \
 - `use_shared_embedding` can help for models with tied input/output embedding layers, given that you quantize using TorchAO low bit ops (`quantization.qmode: torchao:8da(\\d+)w` or `quantization.qmode: torchao:fpa(\d+)w`), see more [here](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L307).
 - `use_attention_sink` to extend generation by removing from the beginning of the KV cache when the max context length is reached.
 - `quantize_kv_cache` quantizes the KV cache in int8.
-- `local_global_attention` impements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache.
+- `local_global_attention` implements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache.
 
 ## Quantization
 Quantization options are defined by [`QuantizationConfig`](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L283). ExecuTorch does quantization in two ways:
@@ -92,7 +92,7 @@ The quantization modes are defined [here](https://github.com/pytorch/executorch/
 
 Common ones to use are:
 - `8da4w`: short for int8 dynamic activation + int4 weight quantization.
-- `int8`: int8 weight-only quanziation.
+- `int8`: int8 weight-only quantization.
 
 Group size is specified with:
 - `group_size`: 8, 32, 64, etc.
diff --git a/docs/source/llm/run-with-c-plus-plus.md b/docs/source/llm/run-with-c-plus-plus.md
index f987fcab2a5..217afad847b 100644
--- a/docs/source/llm/run-with-c-plus-plus.md
+++ b/docs/source/llm/run-with-c-plus-plus.md
@@ -10,7 +10,7 @@ Before you begin, make sure you have:
    - Please also see [Model Metadata](#model-metadata) section for important metadata to be serialized into `.pte`.
 2. A tokenizer file compatible with your model
    - For HuggingFace tokenizers, this is a JSON file `tokenizer.json`
-   - For SentencePiece tokenizers, this is is a `tokenizer.model` file and normally live alongside the weights file
+   - For SentencePiece tokenizers, this is a `tokenizer.model` file and normally lives alongside the weights file
 3. CMake and a C++ compiler installed
    - CMake version 3.29 or higher
    - g++ or clang compiler
diff --git a/docs/source/llm/working-with-llms.md b/docs/source/llm/working-with-llms.md
index 17b2e46c0a5..4c238f7ae5c 100644
--- a/docs/source/llm/working-with-llms.md
+++ b/docs/source/llm/working-with-llms.md
@@ -1,13 +1,18 @@
-# Working with LLMs
+(working-with-llms)=
+
+# LLMs
+
+Learn how to export LLM models and deploy them across different platforms and runtime environments. This section covers the complete workflow from model export to running inference on mobile devices and edge hardware.
+
 
 ```{toctree}
 :maxdepth: 1
+:caption: Working with LLMs
 
 getting-started
 export-llm
 export-custom-llm
 run-with-c-plus-plus
-llama-demo-android
 build-run-llama3-qualcomm-ai-engine-direct-backend
 run-on-ios
 ```
diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md
new file mode 100644
index 00000000000..acbdb06a6b6
--- /dev/null
+++ b/docs/source/platforms-desktop.md
@@ -0,0 +1,23 @@
+# Desktop & Laptop
+
+ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windows.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Platform
+
+### Linux
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
+- [ARM Ethos-U (ARM64)](backends-arm-ethos-u)
+
+### macOS
+- [CoreML (recommended)](backends-coreml)
+- [MPS (Apple Silicon)](backends-mps)
+- [XNNPACK (CPU)](backends-xnnpack)
+
+### Windows
+- [XNNPACK (CPU)](backends-xnnpack)
+- [OpenVINO (Intel)](build-run-openvino)
diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md
new file mode 100644
index 00000000000..5ea248fc0d9
--- /dev/null
+++ b/docs/source/platforms-embedded.md
@@ -0,0 +1,19 @@
+# Embedded Platforms
+
+ExecuTorch supports embedded devices from microcontrollers to edge devices.
+
+## Platform-Specific Guides
+- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide
+- [Building from Source](using-executorch-building-from-source)
+
+## Available Backends by Device Type
+
+### Microcontrollers
+- [Cadence Xtensa Backend](backends-cadence)
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [Custom Backend Development](backend-delegates-integration)
+
+### Edge Devices
+- [ARM Ethos-U NPU Backend](backends-arm-ethos-u)
+- [NXP eIQ Neutron Backend](backend-nxp)
+- [Custom Hardware Integration](backend-delegates-integration)
diff --git a/docs/source/quantization-optimization.md b/docs/source/quantization-optimization.md
new file mode 100644
index 00000000000..d2005b3adac
--- /dev/null
+++ b/docs/source/quantization-optimization.md
@@ -0,0 +1,20 @@
+(quantization-optimization)=
+
+# Quantization & Optimization
+
+Advanced techniques for model compression and performance optimization.
+
+## Quantization Strategies
+
+- {doc}`quantization-overview` — Comprehensive quantization strategies and techniques
+
+## Performance Optimization
+
+- {doc}`runtime-profiling` — Performance profiling and optimization techniques
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+quantization-overview
+runtime-profiling
diff --git a/docs/source/quantization-overview.md b/docs/source/quantization-overview.md
index fdceee80e8e..4ff8d34a4a8 100644
--- a/docs/source/quantization-overview.md
+++ b/docs/source/quantization-overview.md
@@ -14,7 +14,7 @@ Quantization in ExecuTorch is backend-specific. Each backend defines how models
 The PT2E quantization workflow has three main steps:
 
 1. Configure a backend-specific quantizer.
-2. Prepare, calibrate, convert, and evalute the quantized model in PyTorch
+2. Prepare, calibrate, convert, and evaluate the quantized model in PyTorch
 3. Lower the model to the target backend
 
 ## 1. Configure a Backend-Specific Quantizer
diff --git a/docs/source/quick-start-section.md b/docs/source/quick-start-section.md
new file mode 100644
index 00000000000..b35bed8d22c
--- /dev/null
+++ b/docs/source/quick-start-section.md
@@ -0,0 +1,38 @@
+(quick-start-section)=
+# Quick Start
+
+Get started with ExecuTorch in just a few steps.
+
+This section walks you through the essential steps to get ExecuTorch up and running, from initial setup to exporting your first model for edge deployment.
+
+## What You'll Learn
+
+Follow these guides in order to get started with ExecuTorch:
+
+- **{doc}`getting-started`** - Initial Setup: Set up your development environment and run your first ExecuTorch example.
+
+- **{doc}`using-executorch-export`** - Exporting your model: Export for Edge deployment.
+
+- **{doc}`using-executorch-building-from-source`** - Building from Source: Build ExecuTorch from source for custom configurations and development.
+
+## Prerequisites
+
+- Python 3.10-3.12
+- PyTorch 2.9+
+- Basic familiarity with PyTorch model development
+
+## Next Steps
+
+After completing the quick start, explore:
+
+- **{doc}`edge-platforms-section`** - Deploy to specific platforms (Android, iOS, Desktop, Embedded)
+- **{doc}`backends-section`** - Choose the right acceleration backend for your hardware
+
+```{toctree}
+:hidden:
+:maxdepth: 2
+:caption: Quick Start Guide
+
+getting-started
+using-executorch-export
+using-executorch-building-from-source
diff --git a/docs/source/running-a-model-cpp-tutorial.md b/docs/source/running-a-model-cpp-tutorial.md
index a12ef122bc8..5ae4235995d 100644
--- a/docs/source/running-a-model-cpp-tutorial.md
+++ b/docs/source/running-a-model-cpp-tutorial.md
@@ -6,13 +6,13 @@ In this tutorial, we will cover how to run an ExecuTorch model in C++ using the
 
 For a high level overview of the ExecuTorch Runtime please see [Runtime Overview](runtime-overview.md), and for more in-depth documentation on
 each API please see the [Runtime API Reference](executorch-runtime-api-reference.rst).
-[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.md) doc shows how to build and run it.
+[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.rst) doc shows how to build and run it.
 
 
 ## Prerequisites
 
 You will need an ExecuTorch model to follow along. We will be using
-the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial).
+the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore -->.
 
 ## Model Loading
 
@@ -96,7 +96,7 @@ MemoryManager memory_manager(&method_allocator, &planned_memory);
 
 ## Loading a Method
 
-In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to intializing delegates, etc.
+In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to initializing delegates, etc.
 
 ``` cpp
 Result<Method> method = program->load_method(method_name);
diff --git a/docs/source/runtime-integration-advanced.md b/docs/source/runtime-integration-advanced.md
new file mode 100644
index 00000000000..a76265c4093
--- /dev/null
+++ b/docs/source/runtime-integration-advanced.md
@@ -0,0 +1,20 @@
+(runtime-integration-advanced)=
+
+# Runtime & Integration
+
+Advanced runtime integration topics
+
+## Platform Integration
+
+- {doc}`runtime-platform-abstraction-layer` — Platform abstraction layer for cross-platform deployment
+
+## Portable C++ Programming
+
+- {doc}`portable-cpp-programming` — Portable C++ programming for cross-platform deployment
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+
+runtime-platform-abstraction-layer
+portable-cpp-programming
diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md
index 96a618a2a41..1df3da40478 100644
--- a/docs/source/runtime-overview.md
+++ b/docs/source/runtime-overview.md
@@ -11,7 +11,7 @@ Works](intro-how-it-works.md).
 At the highest level, the ExecuTorch runtime is responsible for:
 
 * Loading binary `.pte` program files that were generated by the
-  [`to_executorch()`](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) step of the
+  [`to_executorch()`](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore --> step of the
   model-lowering process.
 * Executing the series of instructions that implement a lowered model.
 
diff --git a/docs/source/runtime-profiling.md b/docs/source/runtime-profiling.md
index 120d31954fd..56b62de599d 100644
--- a/docs/source/runtime-profiling.md
+++ b/docs/source/runtime-profiling.md
@@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](model
     - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level.
 
 
-Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a step-by-step walkthrough of the above process on a sample model.
+Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) <!-- @lint-ignore --> for a step-by-step walkthrough of the above process on a sample model.
diff --git a/docs/source/success-stories.md b/docs/source/success-stories.md
new file mode 100644
index 00000000000..cba874132c6
--- /dev/null
+++ b/docs/source/success-stories.md
@@ -0,0 +1,56 @@
+(success-stories)=
+
+# Success Stories
+
+Discover how organizations are leveraging ExecuTorch to deploy AI models at scale on edge devices.
+
+---
+
+## 🎯 Featured Success Stories
+
+::::{grid} 1
+:gutter: 3
+
+:::{grid-item-card} **🚀 Story 1: [Title Placeholder]**
+:class-header: bg-primary text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-1-details)
+:::
+
+:::{grid-item-card} **⚡ Story 2: [Title Placeholder]**
+:class-header: bg-success text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+
+[Read Full Story →](#story-2-details)
+:::
+
+:::{grid-item-card} **🧠 Story 3: [Title Placeholder]**
+:class-header: bg-info text-white
+
+**Industry:** [Industry]
+**Hardware:** [Hardware Platform]
+**Impact:** [Key Metrics]
+
+[Placeholder Description] - Brief overview of the challenge, solution, and results achieved.
+
+
+[Read Full Story →](#story-3-details)
+:::
+
+::::
+
+---
diff --git a/docs/source/support-section.md b/docs/source/support-section.md
new file mode 100644
index 00000000000..64c47a3e55b
--- /dev/null
+++ b/docs/source/support-section.md
@@ -0,0 +1,17 @@
+(support-section)=
+# Support
+
+In this section, find answers to common questions, troubleshooting guides, and information on how to contribute to the ExecuTorch project. Get help with issues and learn how to participate in the community.
+
+- {doc}`using-executorch-faqs` — FAQ
+- {doc}`using-executorch-troubleshooting` — Common Issues
+- {doc}`contributing` — Contributing
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Support
+
+using-executorch-faqs
+using-executorch-troubleshooting
+contributing
diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md
new file mode 100644
index 00000000000..461a1f6849a
--- /dev/null
+++ b/docs/source/tools-section.md
@@ -0,0 +1,30 @@
+(tools-sdk-section)=
+
+# Tools
+
+In this section, explore ExecuTorch's comprehensive developer tools for profiling, debugging, and model inspection. These tools help optimize performance and troubleshoot issues during development and deployment.
+
+- {doc}`devtools-overview` — Developer Tools Overview
+- {doc}`bundled-io` — Bundled I/O
+- {doc}`etrecord` — ETRecord
+- {doc}`etdump` — ETDump
+- {doc}`runtime-profiling` — Profiling Suite
+- {doc}`model-debugging` — Debugging Tools
+- {doc}`model-inspector` — Model Inspector
+- {doc}`memory-planning-inspection` — Memory Planning Inspection
+- {doc}`devtools-tutorial` — Development Utilities
+
+```{toctree}
+:hidden:
+:maxdepth: 1
+:caption: Tools
+
+devtools-overview
+bundled-io
+etrecord
+etdump
+runtime-profiling
+model-debugging
+model-inspector
+memory-planning-inspection
+devtools-tutorial
diff --git a/docs/source/tutorial-arm-ethos-u.md b/docs/source/tutorial-arm-ethos-u.md
index 8fc4299cbb9..0c713e996f8 100644
--- a/docs/source/tutorial-arm-ethos-u.md
+++ b/docs/source/tutorial-arm-ethos-u.md
@@ -17,12 +17,6 @@ In this tutorial you will learn how to export a simple PyTorch model for the Exe
 
 ::::
 
-```{warning}
-This delegate is under active development, to get best results please use a recent version.
-The TOSA and Ethos-U backend support is reasonably mature and used in production by some users.
-You may encounter some rough edges and features which may be documented or planned but not implemented, please refer to the in-tree documentation for the latest status of features.
-```
-
 ```{tip}
 If you are already familiar with this delegate, you may want to jump directly to the examples:
 * [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm)
@@ -217,4 +211,4 @@ If you encountered any bugs or issues following this tutorial please file a bug/
 
 ```
 Arm is a registered trademark of Arm Limited (or its subsidiaries or affiliates).
-```
\ No newline at end of file
+```
diff --git a/docs/source/tutorial-arm-vgf.md b/docs/source/tutorial-arm-vgf.md
index a29c2ada6e9..0e34e4be4b6 100644
--- a/docs/source/tutorial-arm-vgf.md
+++ b/docs/source/tutorial-arm-vgf.md
@@ -171,6 +171,7 @@ cmake \
   -DCMAKE_BUILD_TYPE=Debug \
   -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
   -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
   -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
   -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -192,7 +193,7 @@ The block diagram below demonstrates, at the high level, how the various build a
 
 ## Deploying and running on device
 
-Since we are using the Vulkan emulation layer, we can run the the executor runner with the VGF delegate on the host machine:
+Since we are using the Vulkan emulation layer, we can run the executor runner with the VGF delegate on the host machine:
 
 ```bash
 ./cmake-out/executor_runner -model_path simple_example.pte
diff --git a/docs/source/tutorial-template.md b/docs/source/tutorial-template.md
index b25731afa17..73b787c9e2c 100644
--- a/docs/source/tutorial-template.md
+++ b/docs/source/tutorial-template.md
@@ -9,12 +9,12 @@
 :::{grid-item-card}  Tutorials we recommend you complete before this:
 :class-card: card-prerequisites
 * [Introduction to ExecuTorch](intro-how-it-works.md)
-* [Setting up ExecuTorch](getting-started-setup.md)
-* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md)
+* [Setting up ExecuTorch](getting-started-setup.rst)
+* [Building ExecuTorch with CMake](using-executorch-building-from-source.md)
 :::
 ::::
 
-## Prerequsites (Hardware and Software)
+## Prerequisites (Hardware and Software)
 
 Provide instructions on what kind of hardware and software are pre-requisite for the tutorial.
 
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md
index bccd4e4add3..3fb079f24d6 100644
--- a/docs/source/tutorial-xnnpack-delegate-lowering.md
+++ b/docs/source/tutorial-xnnpack-delegate-lowering.md
@@ -11,7 +11,7 @@ In this tutorial, you will learn how to export an XNNPACK lowered Model and run
 :::{grid-item-card}  Before you begin it is recommended you go through the following:
 :class-card: card-prerequisites
 * [Setting up ExecuTorch](getting-started-setup.rst)
-* [Model Lowering Tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial)
+* [Model Lowering Tutorial](tutorials/export-to-executorch-tutorial) <!-- @lint-ignore -->
 * [ExecuTorch XNNPACK Delegate](backends-xnnpack.md)
 :::
 ::::
@@ -74,7 +74,7 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and
 
 
 ## Lowering a Quantized Model to XNNPACK
-The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder.
+The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Quantization Overview](quantization-overview.md). For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder.
 
 ```python
 from torch.export import export
diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md
index 4b388460c87..ce9977218a1 100644
--- a/docs/source/using-executorch-android.md
+++ b/docs/source/using-executorch-android.md
@@ -72,7 +72,7 @@ curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250
 curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250412/executorch.aar.sha256sums
 ```
 
-We aim to make every daily snapshot available and useable. However, for best stability, please use releases, not snapshots.
+We aim to make every daily snapshot available and usable. However, for best stability, please use releases, not snapshots.
 
 ## Using AAR file
 
diff --git a/docs/source/using-executorch-cpp.md b/docs/source/using-executorch-cpp.md
index 3736226bc06..5505ade9573 100644
--- a/docs/source/using-executorch-cpp.md
+++ b/docs/source/using-executorch-cpp.md
@@ -69,7 +69,7 @@ The runner source code can be found in the ExecuTorch repo under [examples/porta
 
 ## Next Steps
 
-- [Runtime API Reference](executorch-runtime-api-reference.md) for documentation on the available C++ runtime APIs.
+- [Runtime API Reference](executorch-runtime-api-reference.rst) for documentation on the available C++ runtime APIs.
 - [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md) for information on the high-level Module API.
 - [Managing Tensor Memory in C++](extension-tensor.md) for information on high-level tensor APIs.
 - [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md) for information on the low-level runtime APIs.
diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md
index b3d1836b78a..7abf5cbd30a 100644
--- a/docs/source/using-executorch-export.md
+++ b/docs/source/using-executorch-export.md
@@ -24,7 +24,7 @@ Quantization - the process of using reduced precision to reduce inference time a
 
 ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each.
 
-The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details.
+The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requirements and level of model support. See the documentation for each hardware backend for more details.
 
 As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example.
 
@@ -32,7 +32,7 @@ As part of the .pte file creation process, ExecuTorch identifies portions of the
 
 Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information.
 
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
+- [XNNPACK (CPU)](backends-xnnpack.md)
 - [Core ML (iOS)](backends-coreml.md)
 - [Metal Performance Shaders (iOS GPU)](backends-mps.md)
 - [Vulkan (Android GPU)](backends-vulkan.md)
@@ -206,7 +206,7 @@ outputs = module.forward([input_tensor])
 
 There is also an E2E demo in [executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/program-data-separation).
 
-For more information, see [Runtime API Reference](executorch-runtime-api-reference.md).
+For more information, see [Runtime API Reference](executorch-runtime-api-reference.rst).
 
 ## Advanced Topics
 
@@ -280,7 +280,7 @@ decode_ep = torch.export.export(DecodeWrapper(model), ...)
 
 ## Next Steps
 
-The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.md) for more information.
+The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.rst) for more information.
 
 For advanced use cases, see the following:
 - [Quantization Overview](quantization-overview.md) for information on quantizing models to reduce inference time and memory footprint.
diff --git a/docs/source/using-executorch-faqs.md b/docs/source/using-executorch-faqs.md
index 1d90edc6dc2..c147403c9e8 100644
--- a/docs/source/using-executorch-faqs.md
+++ b/docs/source/using-executorch-faqs.md
@@ -48,7 +48,7 @@ Thread count can be set with the following function. Ensure this is done prior t
 ::executorch::extension::threadpool::get_threadpool()->_unsafe_reset_threadpool(num_threads);
 ```
 
-For a deeper investgiation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information.
+For a deeper investigation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information.
 
 ### Missing Logs
 
diff --git a/docs/source/using-executorch-runtime-integration.md b/docs/source/using-executorch-runtime-integration.md
index 550cb3eb71a..36bc4f6b2fe 100644
--- a/docs/source/using-executorch-runtime-integration.md
+++ b/docs/source/using-executorch-runtime-integration.md
@@ -64,7 +64,7 @@ namespace {
 ```
 
 ### Weak Symbol Override
-ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatability.
+ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatibility.
 
 To override one or more PAL methods, take the following steps:
 
diff --git a/docs/source/using-executorch-troubleshooting.md b/docs/source/using-executorch-troubleshooting.md
index 56c2e1a0653..75648dc5b46 100644
--- a/docs/source/using-executorch-troubleshooting.md
+++ b/docs/source/using-executorch-troubleshooting.md
@@ -1,11 +1,11 @@
 # Profiling and Debugging
 
-To faciliate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch.
+To facilitate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch.
 
 ## General Troubleshooting Steps
 
 - To troubleshoot failure of runtime API calls, such as loading or running a model, ensure that ExecuTorch framework logging is enabled. See [Logging](using-executorch-runtime-integration.md#logging) for more information.
-- As a prelimatinary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information.
+- As a preliminary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information.
 - Check [Frequently Asked Questions](using-executorch-faqs.md) for common issues and questions encountered during install, model export, and runtime integration.
 
 ## Developer Tools
@@ -16,5 +16,5 @@ The ExecuTorch developer tools, or devtools, are a collection of tooling for tro
 
 - [Frequently Asked Questions](using-executorch-faqs.md) for solutions to commonly encountered questions and issues.
 - [Introduction to the ExecuTorch Developer Tools](runtime-profiling.md) for a high-level introduction to available developer tooling.
-- [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for information on runtime performance profiling.
+- [Using the ExecuTorch Developer Tools to Profile a Model](tutorials/devtools-integration-tutorial) <!-- @lint-ignore --> for information on runtime performance profiling.
 - [Inspector APIs](runtime-profiling.md) for reference material on trace inspector APIs.
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 53020d1bea0..34ed7e3f1bd 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -18,23 +18,18 @@
 import torch
 from examples.devtools.scripts.export_bundled_program import save_bundled_program
 from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
-from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner
-from executorch.backends.arm.quantizer import (
-    EthosUQuantizer,
-    get_symmetric_quantization_config,
-    TOSAQuantizer,
-    VgfQuantizer,
-)
+from executorch.backends.arm.ethosu import EthosUCompileSpec
+from executorch.backends.arm.quantizer import get_symmetric_quantization_config
 from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
-from executorch.backends.arm.tosa.partitioner import TOSAPartitioner
+from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
 
 from executorch.backends.arm.util.arm_model_evaluator import (
     evaluate_model,
     evaluator_calibration_data,
 )
 
-from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner
+from executorch.backends.arm.vgf import VgfCompileSpec
 
 # To use Cortex-M backend
 from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import (
@@ -61,6 +56,8 @@
 
 from executorch.extension.export_util.utils import save_pte_program
 from tabulate import tabulate
+from torch.export import ExportedProgram
+from torch.fx import GraphModule
 from torch.utils.data import DataLoader
 
 # Quantize model if required using the standard export quantizaion flow.
@@ -145,26 +142,19 @@ def get_model_and_inputs_from_name(
 
 
 def quantize(
-    model: torch.nn.Module,
+    model: GraphModule,
     model_name: str,
     compile_specs: EthosUCompileSpec | VgfCompileSpec | TosaCompileSpec,
     example_inputs: Tuple[torch.Tensor],
     evaluator_name: str | None,
     evaluator_config: Dict[str, Any] | None,
-) -> torch.nn.Module:
+) -> GraphModule:
     """This is the official recommended flow for quantization in pytorch 2.0
     export"""
     logging.info("Quantizing Model...")
     logging.debug(f"Original model: {model}")
-    quantizer = None
-    if isinstance(compile_specs, EthosUCompileSpec):
-        quantizer = EthosUQuantizer(compile_specs)
-    elif isinstance(compile_specs, TosaCompileSpec):
-        quantizer = TOSAQuantizer(compile_specs)
-    elif isinstance(compile_specs, VgfCompileSpec):
-        quantizer = VgfQuantizer(compile_specs)
-    else:
-        raise RuntimeError("Unsupported compilespecs for quantization!")
+
+    quantizer = create_quantizer(compile_specs)
 
     operator_config = get_symmetric_quantization_config()
     quantizer.set_global(operator_config)
@@ -295,7 +285,9 @@ def get_calibration_data(
 ):
     # Firstly, if the model is being evaluated, take the evaluators calibration function if it has one
     if evaluator_name is not None:
-        return evaluator_calibration_data(evaluator_name, evaluator_config)
+        evaluator_data = evaluator_calibration_data(evaluator_name, evaluator_config)
+        if evaluator_data is not None:
+            return evaluator_data
 
     # If the model is in the calibration_data dictionary, get the data from there
     # This is used for the simple model examples provided
@@ -418,7 +410,7 @@ def get_args():
         required=False,
         nargs="?",
         const="generic",
-        choices=["generic", "mv2"],
+        choices=["generic", "mv2", "deit_tiny"],
         help="Flag for running evaluation of the model.",
     )
     parser.add_argument(
@@ -601,7 +593,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s
     save_bundled_program(exec_prog, method_test_suites, output_name)
 
 
-def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
+def quantize_model(
+    args,
+    model: GraphModule,
+    example_inputs: Tuple[torch.Tensor],
+    compile_spec,
+) -> Tuple[GraphModule, ExportedProgram]:
     model_int8 = quantize(
         model,
         args.model_name,
@@ -619,7 +616,10 @@ def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec):
 
 
 def to_edge_TOSA_delegate(
-    exported_program, args, model: torch.nn.Module, example_inputs
+    exported_program: ExportedProgram,
+    args,
+    model: GraphModule,
+    example_inputs: Tuple[torch.Tensor],
 ):
     # As we can target multiple output encodings, one must
     # be specified.
@@ -638,16 +638,8 @@ def to_edge_TOSA_delegate(
         model_int8, exported_program = quantize_model(
             args, model, example_inputs, compile_spec
         )
-        model = model_int8
-
-    if isinstance(compile_spec, EthosUCompileSpec):
-        partitioner = EthosUPartitioner(compile_spec)
-    elif isinstance(compile_spec, TosaCompileSpec):
-        partitioner = TOSAPartitioner(compile_spec)
-    elif isinstance(compile_spec, VgfCompileSpec):
-        partitioner = VgfPartitioner(compile_spec)
-    else:
-        raise RuntimeError(f"Unhandled compile spec: {compile_spec}")
+
+    partitioner = create_partitioner(compile_spec)
 
     edge = to_edge_transform_and_lower(
         exported_program,
@@ -660,7 +652,12 @@ def to_edge_TOSA_delegate(
     return model_int8, edge
 
 
-def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_inputs):
+def to_edge_no_delegate(
+    exported_program: ExportedProgram,
+    args,
+    model: GraphModule,
+    example_inputs: Tuple[torch.Tensor],
+):
     model_int8 = None
     if args.quantize:
         # As we can target multiple output encodings, one must
diff --git a/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
new file mode 100644
index 00000000000..11590a8578f
--- /dev/null
+++ b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch
@@ -0,0 +1,25 @@
+From f6a7d867212336b3e344c21240a2a03671bffd65 Mon Sep 17 00:00:00 2001
+From: Per Held <per.held@arm.com>
+Date: Wed, 17 Sep 2025 13:46:05 +0200
+Subject: Remove hello_world from applications
+
+---
+ applications/CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt
+index a017575..130f0f7 100644
+--- a/applications/CMakeLists.txt
++++ b/applications/CMakeLists.txt
+@@ -21,7 +21,7 @@ add_subdirectory(driver_unit_tests)
+ 
+ add_subdirectory(freertos)
+ 
+-add_subdirectory(hello_world)
++#add_subdirectory(hello_world)
+ 
+ add_subdirectory(threadx_demo)
+ 
+-- 
+2.43.0
+
diff --git a/examples/arm/vgf_minimal_example.ipynb b/examples/arm/vgf_minimal_example.ipynb
index 4589745e8e7..1f8e0a61601 100644
--- a/examples/arm/vgf_minimal_example.ipynb
+++ b/examples/arm/vgf_minimal_example.ipynb
@@ -240,6 +240,7 @@
     "  -DCMAKE_BUILD_TYPE=Debug \\\n",
     "  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \\\n",
     "  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \\\n",
+    "  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \\\n",
     "  -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \\\n",
     "  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \\\n",
     "  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \\\n",
diff --git a/examples/cuda/scripts/__init__.py b/examples/cuda/scripts/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/examples/cuda/scripts/export.py b/examples/cuda/scripts/export.py
new file mode 100644
index 00000000000..c103d7ee50a
--- /dev/null
+++ b/examples/cuda/scripts/export.py
@@ -0,0 +1,116 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Example script for exporting simple models to flatbuffer with CUDA delegate.
+
+import argparse
+import pathlib
+
+import torch
+
+from executorch.backends.cuda.cuda_backend import CudaBackend
+
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+
+from executorch.examples.models import MODEL_NAME_TO_MODEL
+from executorch.examples.models.model_factory import EagerModelFactory
+
+from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
+
+from executorch.extension.export_util.utils import save_pte_program
+from torch._inductor.decomposition import conv1d_to_conv2d
+from torch.nn.attention import SDPBackend
+
+# Script to export a model with CUDA delegation.
+
+_EDGE_COMPILE_CONFIG = EdgeCompileConfig(
+    _check_ir_validity=False,
+    _skip_dim_order=True,  # TODO(T182928844): enable dim_order in backend
+)
+
+
+def is_fbcode():
+    return not hasattr(torch.version, "git_version")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-m",
+        "--model_name",
+        required=True,
+        help=f"Provide model name. Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=pathlib.Path,
+        default=pathlib.Path("./"),
+        help="Output directory for the exported model",
+    )
+    parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction)
+    parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction)
+
+    args = parser.parse_args()
+    return args
+
+
+def save_processed_bytes(processed_bytes, base_name: str):
+    filename = f"{base_name}.bin"
+    print(f"Saving processed bytes to {filename}")
+    with open(filename, "wb") as file:
+        file.write(processed_bytes)
+    return
+
+
+def main():
+    args = parse_args()
+
+    if args.model_name not in MODEL_NAME_TO_MODEL:
+        raise RuntimeError(
+            f"Model {args.model_name} is not a valid name. "
+            f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+        )
+
+    (
+        model,
+        example_args,
+        example_kwargs,
+        dynamic_shapes,
+    ) = EagerModelFactory.create_model(*MODEL_NAME_TO_MODEL[args.model_name])
+    model = model.eval()
+    exported_programs = torch.export.export(
+        model,
+        args=example_args,
+        kwargs=example_kwargs,
+        dynamic_shapes=dynamic_shapes,
+    )
+    print(exported_programs)
+
+    partitioner = CudaPartitioner(
+        [CudaBackend.generate_method_name_compile_spec(args.model_name)]
+    )
+    # Add decompositions for triton to generate kernels.
+    exported_programs = exported_programs.run_decompositions(
+        {
+            torch.ops.aten.conv1d.default: conv1d_to_conv2d,
+        }
+    )
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]):
+        et_prog = to_edge_transform_and_lower(
+            exported_programs,
+            partitioner=[partitioner],
+            compile_config=_EDGE_COMPILE_CONFIG,
+            generate_etrecord=args.generate_etrecord,
+        )
+    exec_program = et_prog.to_executorch()
+    save_pte_program(exec_program, args.model_name, args.output_dir)
+    if args.generate_etrecord:
+        exec_program.get_etrecord().save(f"{args.model_name}_etrecord.bin")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/gemma/__init__.py b/examples/models/gemma/__init__.py
new file mode 100644
index 00000000000..13a14ff0751
--- /dev/null
+++ b/examples/models/gemma/__init__.py
@@ -0,0 +1,16 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.gemma.convert_weights import convert_weights
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class GemmaModel(Llama2Model):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+__all__ = [
+    "GemmaModel",
+    "convert_weights",
+]
diff --git a/examples/models/gemma/config/2b_config.json b/examples/models/gemma/config/2b_config.json
new file mode 100644
index 00000000000..20a40723c30
--- /dev/null
+++ b/examples/models/gemma/config/2b_config.json
@@ -0,0 +1,19 @@
+{
+  "dim": 2048,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 16384,
+  "n_heads": 8,
+  "head_dim": 256,
+  "n_kv_heads": 1,
+  "n_layers": 18,
+  "act_fn": "gelu",
+  "norm_type": "gemma3",
+  "norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "use_scaled_rope": false,
+  "apply_embedding": true,
+  "embedding_scale_factor": 45.254833995939045,
+  "vocab_size": 256000,
+  "use_hf_rope": true,
+  "attention_qkv_bias": false
+}
diff --git a/examples/models/gemma/convert_weights.py b/examples/models/gemma/convert_weights.py
new file mode 100644
index 00000000000..09a17bc2266
--- /dev/null
+++ b/examples/models/gemma/convert_weights.py
@@ -0,0 +1,104 @@
+import argparse
+
+import json
+import os
+from typing import Dict
+
+import torch
+from safetensors.torch import load_file
+
+from torchtune.models.convert_weights import get_mapped_key
+
+
+# Weight mappings from Gemma's checkpoint to ExecuTorch's transformer parameters.
+_GEMMA_TO_EXECUTORCH = {
+    "model.embed_tokens.weight": "tok_embeddings.weight",
+    "model.norm.weight": "norm.weight",
+    "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+    "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+    "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+    "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+    "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+    "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+    "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
+    "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+    "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+}
+
+
+def gemma_to_executorch(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert the state dict so that it matches what ExecuTorch's transformer definition expects.
+    """
+    converted_state_dict = {}
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, _GEMMA_TO_EXECUTORCH)
+        converted_state_dict[new_key] = value
+    converted_state_dict["output.weight"] = converted_state_dict[
+        "tok_embeddings.weight"
+    ]
+    return converted_state_dict
+
+
+def load_checkpoint_from_safetensors(input_dir: str) -> Dict:
+    index_path = os.path.join(input_dir, "model.safetensors.index.json")
+    if os.path.exists(index_path):
+        # Sharded checkpoint.
+        with open(index_path, "r") as f:
+            index = json.load(f)
+        weight_map = index["weight_map"]
+        checkpoint_shards = sorted(set(weight_map.values()))
+
+        # Load all the shards into memory
+        shard_to_weights = {}
+        for shard in checkpoint_shards:
+            shard_to_weights[shard] = load_file(os.path.join(input_dir, shard))
+
+        # Merge tensors into consolidated state dict.
+        merged_state_dict = {}
+        for weight_name, shard in weight_map.items():
+            tensor = shard_to_weights[shard][weight_name]
+            merged_state_dict[weight_name] = tensor
+        return merged_state_dict
+    else:
+        # Single checkpoint.
+        state_dict = load_file(os.path.join(input_dir, "model.safetensors"))
+        return state_dict
+
+
+def load_checkpoint(input_dir: str) -> Dict:
+    pytorch_path = os.path.join(input_dir, "pytorch_model.bin")
+    if os.path.exists(pytorch_path):
+        print("Loading checkpoint from PyTorch .bin file")
+        return torch.load(pytorch_path, map_location="cpu", weights_only=True)
+    print("Loading checkpoint from safetensors directory")
+    return load_checkpoint_from_safetensors(input_dir)
+
+
+def convert_weights(input_dir: str, output_file: str) -> None:
+    print("Loading checkpoint...")
+    sd = load_checkpoint(input_dir)
+    print("Converting checkpoint...")
+    sd = gemma_to_executorch(sd)
+    print("Saving checkpoint...")
+    torch.save(sd, output_file)
+    print("Done.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Gemma weights to ExecuTorch transformer format."
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Path to directory containing safetensor checkpoint files, or PyTorch checkpoint file.",
+    )
+    parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+    args = parser.parse_args()
+    convert_weights(args.input_dir, args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index e7c73c0cffc..db0e38b8b74 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -189,6 +189,13 @@ if(TARGET mpsdelegate)
   executorch_target_link_options_shared_lib(mpsdelegate)
 endif()
 
+# Openvino backend
+if(TARGET openvino_backend)
+  find_package(OpenVINO REQUIRED)
+  list(APPEND link_libraries openvino_backend)
+  executorch_target_link_options_shared_lib(openvino_backend)
+endif()
+
 if(TARGET coremldelegate)
   find_library(SQLITE_LIBRARY sqlite3)
   list(
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 5f7f4505c45..0d1728a0c6c 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -94,6 +94,8 @@ Llama 3.2 1B and 3B performance was measured on Android OnePlus 12 device. The p
   </tr>
 </table>
 
+[Please visit this section to try it on OpenVINO backend](../../openvino/llama/README.md).
+
 ## Llama 3/3.1 8B
 Since Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized (PTQ) model.
 
diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py
index 6e3f7cb9fb2..0c0176269b3 100644
--- a/examples/models/llama/attention.py
+++ b/examples/models/llama/attention.py
@@ -516,3 +516,18 @@ def forward(
         output = self.wo(output)
 
         return output, None
+
+
+@register_attention("skip")
+class AttentionSkip(Attention):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+        **kwargs: ForwardOptions,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        return x, None
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index aa3b157c8da..7fa9357f23b 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -36,12 +36,14 @@
 from executorch.extension.llm.export.partitioner_lib import (
     get_coreml_partitioner,
     get_mps_partitioner,
+    get_openvino_partitioner,
     get_qnn_partitioner,
     get_vulkan_partitioner,
     get_xnnpack_partitioner,
 )
 from executorch.extension.llm.export.quantizer_lib import (
     get_coreml_quantizer,
+    get_ov_quantizer,
     get_pt2e_quantization_params,
     get_pt2e_quantizers,
     get_qnn_quantizer,
@@ -203,6 +205,8 @@ def build_args_parser() -> argparse.ArgumentParser:
         choices=[
             "xnnpack_dynamic",
             "xnnpack_dynamic_qc4",
+            "openvino_4wo",
+            "openvino_8wo",
             "qnn_8a8w",
             "qnn_16a16w",
             "qnn_16a4w",
@@ -471,6 +475,14 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True",
     )
+    parser.add_argument("--openvino", action="store_true")
+    parser.add_argument(
+        "--openvino_device",
+        type=str,
+        default="CPU",
+        choices=["CPU", "GPU", "NPU"],
+        help="Specify the device for Openvino (CPU, GPU or NPU).",
+    )
 
     parser.add_argument(
         "--expand_rope_table",
@@ -781,6 +793,14 @@ def get_quantizer_and_quant_params(llm_config):
             llm_config.quantization.pt2e_quantize.value, llm_config.quantization.qmode
         )
         quantizers.append(qnn_quantizer)
+    if llm_config.backend.openvino.enabled and llm_config.quantization.pt2e_quantize:
+        assert not quantizers, "Should not enable both xnnpack and openvino"
+        group_size = llm_config.quantization.group_size
+        group_size = group_size if group_size else 128
+        ov_quantizer = get_ov_quantizer(
+            llm_config.quantization.pt2e_quantize.value, group_size
+        )
+        quantizers.append(ov_quantizer)
     if llm_config.backend.coreml.enabled and llm_config.quantization.pt2e_quantize:
         assert len(quantizers) == 0, "Should not enable both xnnpack / qnn and coreml"
         coreml_quantizer = get_coreml_quantizer(
@@ -887,6 +907,34 @@ def _to_edge_and_lower_llama_xnnpack(
     return builder.to_executorch(passes=additional_passes)
 
 
+def _to_edge_and_lower_llama_openvino(
+    builder_exported,
+    modelname,
+    quantizers,
+    additional_passes,
+    openvino_device: str = "CPU",
+    verbose: bool = False,
+) -> LLMEdgeManager:  # noqa: C901
+    partitioners = []
+
+    # Add OpenVINO partitioner
+    partitioners.append(get_openvino_partitioner(openvino_device))
+    modelname = f"openvino_{modelname}"
+
+    logging.info("Lowering model using following partitioner(s): ")
+    for partitioner in partitioners:
+        logging.info(f"--> {partitioner.__class__.__name__}")
+
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+        partitioners
+    )
+
+    if verbose:
+        print_delegation_info(builder.edge_manager.exported_program().graph_module)
+
+    return builder.to_executorch(passes=additional_passes)
+
+
 def _to_edge_and_lower_llama(  # noqa: C901
     builder_exported,
     modelname,
@@ -1089,11 +1137,18 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
 
     if llm_config.backend.xnnpack.enabled:
         if llm_config.export.foundation_weights_file is not None:
-            gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
-                llm_config.export.foundation_weights_file
-                if "lora" not in x.name
-                else None
-            )
+            if llm_config.export.lora_weights_file is not None:
+                gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
+                    llm_config.export.foundation_weights_file
+                    if "lora" not in x.name
+                    else None
+                )
+            else:
+                gen_tag_fn: Callable[[torch.fx.Node], Optional[str]] = lambda x: (
+                    llm_config.export.foundation_weights_file
+                    if "lora" not in x.name
+                    else llm_config.export.lora_weights_file
+                )
 
             from executorch.exir.passes.external_constants_pass import (
                 delegate_external_constants_pass_unlifted,
@@ -1124,6 +1179,15 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             generate_etrecord=llm_config.debug.generate_etrecord,
             verbose=llm_config.debug.verbose,
         )
+    elif llm_config.backend.openvino.enabled:
+        builder = _to_edge_and_lower_llama_openvino(
+            builder_exported,
+            modelname,
+            quantizers,
+            additional_passes,
+            openvino_device=llm_config.backend.openvino.device,
+            verbose=llm_config.debug.verbose,
+        )
     else:
         builder = _to_edge_and_lower_llama(
             builder_exported,
@@ -1238,12 +1302,15 @@ def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
     else:
         raise ValueError(f"{modelname} is not a valid Llama model.")
 
-    model, example_inputs, example_kwarg_inputs, dynamic_shapes = (
-        EagerModelFactory.create_model(
-            module_name,
-            model_class_name,
-            llm_config=llm_config,
-        )
+    (
+        model,
+        example_inputs,
+        example_kwarg_inputs,
+        dynamic_shapes,
+    ) = EagerModelFactory.create_model(
+        module_name,
+        model_class_name,
+        llm_config=llm_config,
     )
     # Convert dtype override string to actual type.
     dtype_override = DType[llm_config.model.dtype_override.value]
@@ -1322,6 +1389,7 @@ def _get_source_transforms(  # noqa
     local_global_attention: Optional[List[int]] = None,
     use_torchao_kernels_linear: bool = False,
     use_torchao_kernels_tied_embedding: bool = False,
+    quantize_with_hqq: bool = True,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     """
     Return a list of functions that transform a graph.
@@ -1391,7 +1459,10 @@ def _get_source_transforms(  # noqa
         """
         transforms.append(
             get_quant_embedding_transform(
-                embedding_quantize, use_shared_embedding, checkpoint_dtype
+                embedding_quantize,
+                use_shared_embedding,
+                checkpoint_dtype,
+                quantize_with_hqq,
             )
         )
 
@@ -1422,6 +1493,7 @@ def _get_source_transforms(  # noqa
                 calibration_tasks=calibration_tasks,
                 calibration_limit=calibration_limit,
                 calibration_seq_length=calibration_seq_length,
+                quantize_with_hqq=quantize_with_hqq,
             )
         )
 
diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py
index 3a325d0f4f8..6587f7e1a10 100644
--- a/examples/models/llama/llama_transformer.py
+++ b/examples/models/llama/llama_transformer.py
@@ -15,6 +15,7 @@
 from executorch.examples.models.llama.attention import (
     Attention,
     ATTENTION_REGISTRY,
+    AttentionSkip,
     ForwardOptions,
 )
 from executorch.examples.models.llama.feed_forward import FeedForward
@@ -95,7 +96,10 @@ def __init__(self, args: ModelArgs, attention: Attention):
         else:
             self.feed_forward = FeedForward(dim=args.dim, hidden_dim=args.hidden_dim)
 
-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        if isinstance(self.attention, AttentionSkip):
+            self.attention_norm = nn.Identity()
+        else:
+            self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
 
     @classmethod
@@ -120,8 +124,9 @@ def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions):  # x:
         h, attn_options_update = self.attention.forward(
             self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
         )
+        if not isinstance(self.attention, AttentionSkip):
+            h = x + h
 
-        h = x + h
         if hasattr(self, "block_sparse_moe"):
             out = h + self.block_sparse_moe(self.ffn_norm(h))
         else:
diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py
index 04d29f91ac6..3f9d3d8f2af 100644
--- a/examples/models/llama/model_args.py
+++ b/examples/models/llama/model_args.py
@@ -63,6 +63,9 @@ class ModelArgs:
     use_sdpa_with_kv_cache_op: bool = (
         False  # Use custom sdpa op that updates kv cache in-place
     )
+    # Device to use for the model: "cpu" or "cuda" (needed for QAT)
+    # Only used for creating Rope parameters
+    device: str = "cpu"
     # Generate logits for all inputs. When it's True, it would take big memory usage
     # at runtime. Enable it only necessary (e.g., use perplexity tools that requires
     # logits for all input tokens.)
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 8c0d5db6a80..ea4e6b37243 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -138,7 +138,11 @@ def forward(
 # and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
 # Current only support non-long rope.
 def hf_precompute_freqs_cis(
-    dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+    dim: int,
+    end: int,
+    theta: float,
+    partial_rotary_factor: float = 1.0,
+    device: Union[str, torch.device] = "cpu",
 ):
     # Partial rotary embeddings.
     dim = int(dim * partial_rotary_factor)
@@ -146,7 +150,7 @@ def hf_precompute_freqs_cis(
     # Short factor scaling.
     freqs = 1.0 / (
         theta
-        ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
+        ** (torch.arange(0, dim, 2, device=device, dtype=torch.int64).float() / dim)
     )
     # TODO: support long factor scaling.
 
@@ -236,6 +240,7 @@ def __init__(self, params: ModelArgs):
             self.precompute_freqs_cis = partial(
                 hf_precompute_freqs_cis,
                 partial_rotary_factor=self.params.partial_rotary_factor,
+                device=getattr(self.params, "device", "cpu"),
             )
             self.apply_rotary_emb = hf_apply_rotary_emb
         else:
@@ -244,6 +249,7 @@ def __init__(self, params: ModelArgs):
                 use_scaled=self.params.use_scaled_rope,
                 scale_factor=self.params.rope_scale_factor,
                 high_freq_factor=self.params.high_freq_factor,
+                device=getattr(self.params, "device", "cpu"),
             )
             self.apply_rotary_emb = RotaryEmbedding()
 
diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp
index 2ba2fdf9941..19ed9f88339 100644
--- a/examples/models/llama/runner/runner.cpp
+++ b/examples/models/llama/runner/runner.cpp
@@ -37,6 +37,21 @@ std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
     const std::string& tokenizer_path,
     std::optional<const std::string> data_path,
     float temperature) {
+  if (data_path.has_value()) {
+    std::vector<std::string> data_files;
+    data_files.push_back(data_path.value());
+    return create_llama_runner(
+        model_path, tokenizer_path, std::move(data_files), temperature);
+  }
+  return create_llama_runner(
+      model_path, tokenizer_path, std::vector<std::string>(), temperature);
+}
+
+std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    std::vector<std::string> data_files,
+    float temperature) {
   ET_LOG(
       Info,
       "Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
@@ -55,7 +70,7 @@ std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
     return nullptr;
   }
   return llm::create_text_llm_runner(
-      model_path, std::move(tokenizer), data_path);
+      model_path, std::move(tokenizer), data_files);
 }
 
 } // namespace example
diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h
index f07cd4e8ee8..728ae57efa8 100644
--- a/examples/models/llama/runner/runner.h
+++ b/examples/models/llama/runner/runner.h
@@ -11,12 +11,9 @@
 
 #pragma once
 
-#include <cstdint>
-#include <functional>
 #include <memory>
 #include <optional>
 #include <string>
-#include <unordered_map>
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
 #include <executorch/extension/llm/runner/irunner.h>
@@ -30,7 +27,13 @@ namespace llm = ::executorch::extension::llm;
 std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
     const std::string& model_path,
     const std::string& tokenizer_path,
-    std::optional<const std::string> data_path = std::nullopt,
+    std::optional<const std::string> data_path,
+    float temperature = -1.0f);
+
+std::unique_ptr<llm::TextLLMRunner> create_llama_runner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    std::vector<std::string> data_files = {},
     float temperature = -1.0f);
 
 std::unique_ptr<tokenizers::Tokenizer> load_llama_tokenizer(
diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h
index e2d2bc40c60..06fbffbef83 100644
--- a/examples/models/llama/runner/static_attention_io_manager.h
+++ b/examples/models/llama/runner/static_attention_io_manager.h
@@ -586,12 +586,12 @@ class StaticAttentionIOManager {
    * of the prompt and method's input length. Returns the position in the output
    * that corresponds to the end of the prompt during the last inference.
    */
-  template <typename TokenT>
+  template <typename TokenT, typename LogitT>
   size_t prefill(
       executorch::runtime::Span<TokenT> tokens,
       executorch::runtime::Span<TokenT> input_buffer,
       executorch::runtime::Method& method,
-      std::function<void(executorch::runtime::Span<const float>)>
+      std::function<void(executorch::runtime::Span<const LogitT>)>
           logits_callback = nullptr) {
     ET_LOG(Info, "Prefilling at position %zu", input_pos_);
     size_t input_len = input_buffer.size();
@@ -619,7 +619,7 @@ class StaticAttentionIOManager {
           batch_len);
       if (logits_callback) {
         auto logits_tensor = method.get_output(0).toTensor();
-        auto* logits = logits_tensor.const_data_ptr<float>();
+        auto* logits = logits_tensor.const_data_ptr<LogitT>();
         logits_callback(executorch::runtime::Span(
             logits,
             logits + batch_len * logits_tensor.size(logits_tensor.dim() - 1)));
diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py
index 7cb65833f98..9e49f9e4e15 100644
--- a/examples/models/llama/source_transformation/quantize.py
+++ b/examples/models/llama/source_transformation/quantize.py
@@ -49,6 +49,7 @@ def quantize(  # noqa C901
     blocksize: int = 128,
     tokenizer_path: Optional[Path] = None,
     verbose: bool = False,
+    quantize_with_hqq: bool = True,
 ) -> torch.nn.Module:
     """
     Quantizes a model by converting all weights to int8.
@@ -119,7 +120,6 @@ def quantize(  # noqa C901
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
-            MappingType,
             quantize_,
         )
         from torchao.utils import unwrap_tensor_subclass
@@ -134,9 +134,12 @@ def quantize(  # noqa C901
                     weight_granularity=(
                         PerAxis(0) if group_size == 0 else PerGroup(group_size)
                     ),
-                    weight_mapping_type=MappingType.SYMMETRIC,
                     # pyre-ignore[6]
                     intx_packing_format="opaque_torchao_auto",
+                    # pyre-ignore[6]
+                    intx_choose_qparams_algorithm=(
+                        "hqq_scale_only" if quantize_with_hqq else "affine"
+                    ),
                 ),
             )
             model = unwrap_tensor_subclass(model)
@@ -170,6 +173,10 @@ def filter_fn(m, fqn):
                 # pyre-ignore[16]
                 weight_dtype=torch.int4,
                 weight_granularity=PerGroup(group_size),
+                # pyre-ignore[6]
+                intx_choose_qparams_algorithm=(
+                    "hqq_scale_only" if quantize_with_hqq else "affine"
+                ),
             ),
             filter_fn=filter_fn,
         )
@@ -191,6 +198,10 @@ def filter_fn(m, fqn):
             # pyre-ignore[16]
             weight_dtype=torch.int4,
             granularity=PerGroup(q_group_size),
+            # pyre-ignore[6]
+            intx_choose_qparams_algorithm=(
+                "hqq_scale_only" if quantize_with_hqq else "affine"
+            ),
         )
         quantize_(model, q_config)
         model = unwrap_tensor_subclass(model)
@@ -580,6 +591,7 @@ def __init__(
         group_size: Optional[int] = None,
         packed=False,
         precision: Optional[torch.dtype] = None,
+        quantize_with_hqq: bool = True,
     ):
         if isinstance(packed, str):
             packed = packed == "True"
@@ -592,15 +604,12 @@ def __init__(
         self.precision = precision
         if (bitwidth not in [2, 4]) and packed:
             raise RuntimeError("pack only works with bitsize 2, 4")
+        self.quantize_with_hqq = quantize_with_hqq
 
     @torch.no_grad()
     def create_quantized_state_dict(self, packed=False) -> Dict:
         from torchao.quantization.granularity import PerAxis, PerGroup
-        from torchao.quantization.quant_api import (
-            IntxWeightOnlyConfig,
-            MappingType,
-            quantize_,
-        )
+        from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_
 
         cur_state_dict = self.mod.state_dict()
 
@@ -627,7 +636,10 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                         if (self.group_size is None or self.group_size == 0)
                         else PerGroup(self.group_size)
                     ),
-                    mapping_type=MappingType.SYMMETRIC,
+                    # pyre-ignore[6]
+                    intx_choose_qparams_algorithm=(
+                        "hqq_scale_only" if self.quantize_with_hqq else "affine"
+                    ),
                 )
                 quantize_(tmp_model, config, lambda m, fqn: isinstance(m, nn.Embedding))
                 weight = tmp_model.weight.qdata  # pyre-ignore[16]
@@ -765,6 +777,7 @@ def get_quant_embedding_transform(
     embedding_quantize: str,
     use_shared_embedding: bool = False,
     dtype_override: Optional[DType] = None,
+    quantize_with_hqq: bool = True,
 ):
     if embedding_quantize.startswith("torchao:"):
         from torchao.prototype.quantization.embedding.api import (
@@ -825,6 +838,7 @@ def _torchao_embedding_quantizer(model):
         group_size=group_size,
         packed=(bitwidth in [2, 4]),
         precision=torch_dtype,
+        quantize_with_hqq=quantize_with_hqq,
     ).quantized_model()
 
 
@@ -838,6 +852,7 @@ def get_quant_weight_transform(
     calibration_tasks: Optional[list] = None,
     calibration_limit: Optional[int] = None,
     calibration_seq_length: Optional[int] = None,
+    quantize_with_hqq: bool = True,
 ):
     return partial(
         quantize,
@@ -850,6 +865,7 @@ def get_quant_weight_transform(
         calibration_limit=calibration_limit,
         calibration_seq_length=calibration_seq_length,
         tokenizer_path=(Path(path) if (path := tokenizer_path) is not None else None),
+        quantize_with_hqq=quantize_with_hqq,
     )
 
 
@@ -877,7 +893,6 @@ def _load_torchao_aten_lib(libname):
 def set_8da4w_computation_dtype(
     module: nn.Module, computation_dtype: torch.dtype
 ) -> nn.Module:
-
     from torchao.quantization.linear_quant_modules import Int8DynActInt4WeightLinear
 
     def _set_8da4w_computation_dtype(module: nn.Module, dtype: torch.dtype) -> None:
diff --git a/examples/models/llama/static_attention.py b/examples/models/llama/static_attention.py
index b42371dc090..95bae1b766a 100644
--- a/examples/models/llama/static_attention.py
+++ b/examples/models/llama/static_attention.py
@@ -259,7 +259,7 @@ def __init__(
         }
 
         rope = Rope(config)
-        freqs = rope.get_freqs(None, config.max_seq_len)
+        freqs = rope.get_freqs(None, config.max_context_len)
         self.freqs_cos = freqs[0].to(dtype)
         self.freqs_sin = freqs[1].to(dtype)
 
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index cfe691c7bd4..6df4caf8692 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
 set -x
 
 conda install -c conda-forge "ffmpeg<8" -y
-pip install torchcodec==0.7.0.dev20250906 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install torchcodec==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 pip install moshi==0.2.4
 pip install bitsandbytes soundfile
 # Run llama2/install requirements for torchao deps
diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py
index be3c075913d..d0c3c2ceb15 100644
--- a/examples/models/moshi/mimi/test_mimi.py
+++ b/examples/models/moshi/mimi/test_mimi.py
@@ -156,7 +156,7 @@ def test_streaming_encoding_decoding(self):
         all_pcms_streaming = torch.cat(all_pcms_streaming, dim=-1)
         sqnr_streaming = compute_sqnr(pcm_ref, all_pcms_streaming)
         print(f"sqnr_streaming = {sqnr_streaming} dB")
-        self.assertTrue(sqnr_streaming > 100)
+        self.assertTrue(sqnr_streaming > 70)
 
     def test_exported_encoding(self):
         """Ensure exported encoding model is consistent with reference output."""
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 85c6a13e0ff..3995f5533e6 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -86,6 +86,13 @@ list(
   extension_flat_tensor
 )
 
+# Link CUDA backend
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda)
+  executorch_target_link_options_shared_lib(aoti_cuda)
+endif()
+
 # Add tokenizers
 list(APPEND link_libraries tokenizers::tokenizers)
 
diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md
index 8cac4264bba..861043fe2a7 100644
--- a/examples/models/voxtral/README.md
+++ b/examples/models/voxtral/README.md
@@ -36,13 +36,36 @@ optimum-cli export executorch \
 
 This exports Voxtral with XNNPack backend acceleration and 4-bit weight/8-bit activation linear quantization.
 
+## CUDA Support
+If your environment has CUDA support, you can enable the runner to run on CUDA for improved performance. Follow the export and runtime commands below:
+
+**Note:** We are currently working on quantization support for CUDA. Currently, only bfloat16 dtype is supported for CUDA execution.
+
+### Exporting with CUDA
+```
+optimum-cli export executorch \
+  --model "mistralai/Voxtral-Mini-3B-2507" \
+  --task "multimodal-text-to-text" \
+  --recipe "cuda" \
+  --dtype bfloat16 \
+  --device cuda \
+  --max_seq_len 1024 \
+  --output_dir="voxtral"
+```
+
+This will generate:
+- `model.pte` - The exported model
+- `aoti_cuda_blob.ptd` - The CUDA kernel blob required for runtime
+
+See the "Building the multimodal runner" section below for instructions on building with CUDA support, and the "Running the model" section for runtime instructions.
+
 # Running the model
 To run the model, we will use the Voxtral runner, which utilizes ExecuTorch's MultiModal runner API.
 The Voxtral runner will do the following things:
 
 - Audio Input:
-  - Option A:  Pass the raw audio tensor into exported preprocessor to produce a mel spectrogram tensor.
-  - Option B:  If starting directly with an already processed audio input tensor, format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
+   - Option A:  Pass raw audio data from a `.wav` file into the exported preprocessor to produce a mel spectrogram tensor.
+   - Option B:  If starting directly with an already processed audio input tensor (preprocessed mel spectrogram), format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
 - Feed the formatted inputs to the multimodal modal runner.
 
 
@@ -56,6 +79,8 @@ python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --stack_
 ```
 
 ## Building the multimodal runner
+
+### Building for CPU (XNNPack)
 ```
 # Build and install ExecuTorch
 cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON && cmake --build cmake-out -j16 --target install --config Release
@@ -64,15 +89,54 @@ cmake --preset llm -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -
 cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -Bcmake-out/examples/models/voxtral examples/models/voxtral && cmake --build cmake-out/examples/models/voxtral -j16 --config Release
 ```
 
+### Building for CUDA
+```
+# Install ExecuTorch with CUDA support
+CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh
+
+# Build the multimodal runner with CUDA
+cmake --preset llm \
+      -DEXECUTORCH_BUILD_CUDA=ON \
+      -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -Bcmake-out -S.
+cmake --build cmake-out -j16 --target install --config Release
+
+cmake -DEXECUTORCH_BUILD_CUDA=ON \
+      -DCMAKE_BUILD_TYPE=Release \
+      -Sexamples/models/voxtral \
+      -Bcmake-out/examples/models/voxtral/
+cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+```
+
 ## Running the model
 You can download the `tekken.json` tokenizer from [Voxtral's HuggingFace repo](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507).
+
+### Running with raw audio (.wav file)
+For raw audio files (`.wav`), you must provide a preprocessor to convert the audio into mel spectrogram format:
+```
+./cmake-out/examples/models/voxtral/voxtral_runner \
+  --model_path path/to/model.pte \
+  --tokenizer_path path/to/tekken.json \
+  --prompt "What can you tell me about this audio?" \
+  --audio_path path/to/audio_input.wav \
+  --processor_path path/to/voxtral_preprocessor.pte
+```
+
+### Running with preprocessed audio (.bin file)
+If you already have a preprocessed mel spectrogram saved as a `.bin` file, you can skip the preprocessor:
 ```
 ./cmake-out/examples/models/voxtral/voxtral_runner \
   --model_path path/to/model.pte \
   --tokenizer_path path/to/tekken.json \
   --prompt "What can you tell me about this audio?" \
-  --audio_path path/to/audio_input.bin \
-  --processor_path path/to/voxtral_preprocessor.pte # If you're passing raw audio file in audio_path
+  --audio_path path/to/preprocessed_audio.bin
+```
+
+
+**For CUDA:** Add the `--data_path` argument to provide the CUDA kernel blob to the commands above:
+```
+  --data_path path/to/aoti_cuda_blob.ptd
 ```
 
 Example output:
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 081df27cd67..b3dd5e3ab68 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -21,6 +21,7 @@
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
 #include <executorch/extension/llm/runner/multimodal_input.h>
 #include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/extension/llm/runner/wav_loader.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/log.h>
 
@@ -34,6 +35,7 @@ DEFINE_string(
     "multimodal.pte",
     "Model serialized in flatbuffer format.");
 
+DEFINE_string(data_path, "", "Path to data file.");
 DEFINE_string(tokenizer_path, "tekken.json", "Tokenizer stuff.");
 
 DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
@@ -113,15 +115,15 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 }
 
 /**
- * @brief Loads a .bin file into a tensor and processes it using a .pte
- * processor
+ * @brief Loads raw audio from a .bin or .wav file and processes it using a
+ * .pte processor
  *
- * This function loads raw audio data from a .bin file (similar to
- * loadPreprocessedAudio), creates a tensor from it, and then passes it through
- * a processor module loaded from a .pte file to generate processed audio
- * features.
+ * This function loads raw audio data from either a .bin file (raw float array)
+ * or a .wav file (WAV format with headers), creates a tensor from it, and then
+ * passes it through a processor module loaded from a .pte file to generate
+ * processed audio features.
  *
- * @param audio_path Path to the .bin audio file
+ * @param audio_path Path to the .bin or .wav audio file
  * @param processor_path Path to the .pte processor file
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file loading or processing fails
@@ -135,6 +137,41 @@ MultimodalInput processRawAudioFile(
         "Processor path is required for raw audio processing");
   }
 
+  // Load the audio data from file (.bin or .wav)
+  std::vector<float> audio_data;
+  if (ends_with(audio_path, ".wav")) {
+    audio_data = ::executorch::extension::llm::load_wav_audio_data(audio_path);
+    ET_LOG(
+        Info,
+        "Loaded WAV file: %s, %zu samples",
+        audio_path.c_str(),
+        audio_data.size());
+  } else if (ends_with(audio_path, ".bin")) {
+    std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+    if (!f.is_open()) {
+      ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
+      throw std::runtime_error("Failed to open audio file");
+    }
+
+    std::size_t n_floats = f.tellg() / sizeof(float);
+    f.seekg(0, std::ios::beg);
+
+    audio_data.resize(n_floats);
+    f.read(
+        reinterpret_cast<char*>(audio_data.data()),
+        audio_data.size() * sizeof(float));
+    f.close();
+
+    ET_LOG(
+        Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
+  } else {
+    ET_LOG(
+        Error,
+        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
+        audio_path.c_str());
+    throw std::runtime_error("Unsupported audio file format");
+  }
+
   // Load the audio processor .pte.
   std::unique_ptr<Module> processor_module;
   try {
@@ -153,25 +190,6 @@ MultimodalInput processRawAudioFile(
     throw std::runtime_error("Exception while loading processor module");
   }
 
-  // Load the audio data from file.
-  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
-  if (!f.is_open()) {
-    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
-    throw std::runtime_error("Failed to open audio file");
-  }
-
-  std::size_t n_floats = f.tellg() / sizeof(float);
-  f.seekg(0, std::ios::beg);
-
-  std::vector<float> audio_data(n_floats);
-  f.read(
-      reinterpret_cast<char*>(audio_data.data()),
-      audio_data.size() * sizeof(float));
-  f.close();
-
-  ET_LOG(
-      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
-
   // Execute the processor
   std::vector<executorch::aten::SizesType> tensor_shape = {
       static_cast<executorch::aten::SizesType>(audio_data.size())};
@@ -226,33 +244,39 @@ MultimodalInput processRawAudioFile(
  *
  * Dispatches audio file processing based on file extension and processor
  * availability:
+ * - .wav files: Requires processor, processes raw audio through processor
  * - .bin files with processor: Loads raw audio from .bin and processes through
  * processor
  * - .bin files without processor: Loads preprocessed mel spectrogram features
  * directly
  *
- * @param audio_path Path to the audio file (.bin)
- * @param processor_path Path to the processor .pte file (optional)
+ * @param audio_path Path to the audio file (.bin or .wav)
+ * @param processor_path Path to the processor .pte file (optional for .bin,
+ * required for .wav)
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file format is unsupported or processing fails
  */
 MultimodalInput processAudioFile(
     const std::string& audio_path,
     const std::string& processor_path = "") {
-  if (ends_with(audio_path, ".bin")) {
-    if (!processor_path.empty()) {
-      // Process raw audio from .bin file through the processor
-      return processRawAudioFile(audio_path, processor_path);
-    } else {
-      // Load preprocessed audio stored as a binary file (existing behavior)
-      return loadPreprocessedAudio(audio_path);
+  if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".bin")) {
+    if (processor_path.empty()) {
+      if (ends_with(audio_path, ".wav")) {
+        ET_CHECK_MSG(
+            false,
+            "Processor path is required for .wav file processing: %s",
+            audio_path.c_str());
+      } else {
+        // Load preprocessed audio stored as a binary file (existing behavior)
+        return loadPreprocessedAudio(audio_path);
+      }
     }
+    return processRawAudioFile(audio_path, processor_path);
   } else {
-    ET_LOG(
-        Error,
-        "Unsupported audio file format: %s (only .bin files are supported)",
+    ET_CHECK_MSG(
+        false,
+        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
         audio_path.c_str());
-    throw std::runtime_error("Unsupported audio file format");
   }
 }
 
@@ -267,6 +291,7 @@ int32_t main(int32_t argc, char** argv) {
   const char* prompt = FLAGS_prompt.c_str();
   const char* audio_path = FLAGS_audio_path.c_str();
   const char* processor_path = FLAGS_processor_path.c_str();
+  const char* data_path = FLAGS_data_path.c_str();
   float temperature = FLAGS_temperature;
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
@@ -294,7 +319,7 @@ int32_t main(int32_t argc, char** argv) {
   // Create multimodal runner
   std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
       ::executorch::extension::llm::create_multimodal_runner(
-          model_path, std::move(tokenizer));
+          model_path, std::move(tokenizer), data_path);
   if (runner == nullptr) {
     ET_LOG(Error, "Failed to create multimodal runner");
     return 1;
diff --git a/examples/models/yolo12/.gitignore b/examples/models/yolo12/.gitignore
new file mode 100644
index 00000000000..02deda29710
--- /dev/null
+++ b/examples/models/yolo12/.gitignore
@@ -0,0 +1,3 @@
+*.pt
+*.pte
+*.ptd
diff --git a/examples/models/yolo12/README.md b/examples/models/yolo12/README.md
index 2260afa5dde..1a54f1a4a16 100644
--- a/examples/models/yolo12/README.md
+++ b/examples/models/yolo12/README.md
@@ -1,10 +1,11 @@
 # YOLO12 Detection C++ Inference with ExecuTorch
 
-This example demonstrates how to perform inference of [Ultralytics YOLO12 family](https://docs.ultralytics.com/models/yolo12/) detection models in C++ leveraging the Executorch backends:
+This example demonstrates how to perform inference of [YOLO12 family](https://docs.ultralytics.com/models/yolo12/) detection models in C++ leveraging the Executorch backends:
+
 - [OpenVINO](../../../backends/openvino/README.md)
 - [XNNPACK](../../../backends/xnnpack/README.md)
 
-# Performance Evaluation
+## Performance Evaluation
 
 | CPU                            | Model   | Backend  | Device | Precision | Average Latency, ms |
 |--------------------------------|---------|----------|--------|-----------|---------------------|
@@ -17,8 +18,7 @@ This example demonstrates how to perform inference of [Ultralytics YOLO12 family
 | Intel(R) Core(TM) Ultra 7 155H | yolo12s | xnnpack  | CPU    | FP32      | 169.36              |
 | Intel(R) Core(TM) Ultra 7 155H | yolo12l | xnnpack  | CPU    | FP32      | 436.876             |
 
-
-# Instructions
+## Instructions
 
 ### Step 1: Install ExecuTorch
 
@@ -31,35 +31,36 @@ To install ExecuTorch, follow this [guide](https://pytorch.org/executorch/stable
 
 ### Step 3: Install the demo requirements
 
-
 Python demo requirements:
+
 ```bash
 python -m pip install -r examples/models/yolo12/requirements.txt
 ```
 
 Demo infenrece dependency - OpenCV library:
-https://opencv.org/get-started/
-
-
-### Step 4: Export the Yolo12 model to the ExecuTorch
+<https://opencv.org/get-started/>
 
+### Step 4: Export the YOLO12 model to the ExecuTorch
 
 OpenVINO:
+
 ```bash
 python export_and_validate.py --model_name yolo12s --input_dims=[1920,1080]  --backend openvino --device CPU
 ```
 
 OpenVINO quantized model:
+
 ```bash
 python export_and_validate.py --model_name yolo12s --input_dims=[1920,1080]  --backend openvino --quantize --video_input /path/to/calibration/video --device CPU
 ```
 
 XNNPACK:
+
 ```bash
 python export_and_validate.py --model_name yolo12s --input_dims=[1920,1080] --backend xnnpack
 ```
 
-> **_NOTE:_**  Quantization for XNNPACK backend is WIP. Please refere to https://github.com/pytorch/executorch/issues/11523 for more details.
+> **_NOTE:_**  Quantization for XNNPACK backend is WIP. Please refere to <https://github.com/pytorch/executorch/issues/11523> for more details.
 
 Exported model could be validated using the `--validate` key:
 
@@ -70,8 +71,8 @@ python export_and_validate.py --model_name yolo12s --backend ... --validate data
 A list of available datasets and instructions on how to use a custom dataset can be found [here](https://docs.ultralytics.com/datasets/detect/).
 Validation only supports the default `--input_dims`; please do not specify this parameter when using the `--validate` flag.
 
-
 To get a full parameters description please use the following command:
+
 ```bash
 python export_and_validate.py --help
 ```
@@ -103,11 +104,11 @@ make -j$(nproc)
 ```
 
 To get a full parameters description please use the following command:
-```
+
+```bash
 ./build/Yolo12DetectionDemo --help
 ```
 
+## Credits
 
-# Credits:
-
-Ultralytics examples: https://github.com/ultralytics/ultralytics/tree/main/examples
+Ultralytics examples: <https://github.com/ultralytics/ultralytics/tree/main/examples>
diff --git a/examples/models/yolo12/export_and_validate.py b/examples/models/yolo12/export_and_validate.py
index e2349fb6434..ccd0db76d7d 100644
--- a/examples/models/yolo12/export_and_validate.py
+++ b/examples/models/yolo12/export_and_validate.py
@@ -35,7 +35,7 @@
 
 from ultralytics.data.utils import check_det_dataset
 from ultralytics.engine.validator import BaseValidator as Validator
-from ultralytics.utils.torch_utils import de_parallel
+from ultralytics.utils.torch_utils import unwrap_model
 
 
 class CV2VideoIter:
@@ -293,7 +293,7 @@ def _prepare_validation(
     stride = 32  # default stride
     validator.stride = stride  # used in get_dataloader() for padding
     validator.data = check_det_dataset(dataset_yaml_path)
-    validator.init_metrics(de_parallel(model))
+    validator.init_metrics(unwrap_model(model))
 
     data_loader = validator.get_dataloader(
         validator.data.get(validator.args.split), validator.args.batch
diff --git a/examples/nxp/README.md b/examples/nxp/README.md
index bb503ffd288..ef3153f2c91 100644
--- a/examples/nxp/README.md
+++ b/examples/nxp/README.md
@@ -1,20 +1,46 @@
-# PyTorch Model Delegation to Neutron Backend
+# ExecuTorch Neutron Backend examples
+This directory contains examples demonstrating the use of ExecuTorch AoT flow to convert a PyTorch model to ExecuTorch
+format and delegate the model computation to eIQ Neutron NPU using the eIQ Neutron Backend.
 
-In this guide we will show how to use the ExecuTorch AoT flow to convert a PyTorch model to ExecuTorch format and delegate the model computation to eIQ Neutron NPU using the eIQ Neutron Backend.
+## Layout
+* `experimental/` - contains CifarNet model example.
+* `models` - various example models.
+* `aot_neutron_compile.py` - script with end-to-end ExecuTorch AoT Neutron Backend workflow.
+* `README.md` - this file.
+* `run_aot_example.sh` - utility script for aot_neutron_compile.py.
+* `setup.sh` - setup script for Neutron Converter installation.
 
-First we will start with an example script converting the model. This example show the CifarNet model preparation. It is the same model which is part of the `example_cifarnet`
+## Setup
+Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup).
 
-The steps are expected to be executed from the executorch root folder.
-1. Run the setup.sh script to install the neutron-converter:
+Run the setup.sh script to install the neutron-converter:
 ```commandline
-$ examples/nxp/setup.sh
+$ ./examples/nxp/setup.sh
 ```
 
-2. Now run the `aot_neutron_compile.py` example with the `cifar10` model 
-```commandline
-$ python -m examples.nxp.aot_neutron_compile --quantize \
-    --delegate --neutron_converter_flavor SDK_25_09 -m cifar10 
-```
+## Supported models
+* CifarNet
+* MobileNetV2
+
+## PyTorch Model Delegation to Neutron Backend
+First we will start with an example script converting the model. This example show the CifarNet model preparation. 
+It is the same model which is part of the `example_cifarnet` in 
+[MCUXpresso SDK](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-software-development-kit-sdk:MCUXpresso-SDK).
+
+The NXP MCUXpresso software and tools offer comprehensive development solutions designed to help accelerate embedded 
+system development of applications based on MCUs from NXP. The MCUXpresso SDK includes a flexible set of peripheral 
+drivers designed to speed up and simplify development of embedded applications.
+
+The steps are expected to be executed from the `executorch` root folder.
+
+1. Run the `aot_neutron_compile.py` example with the `cifar10` model 
+    ```commandline
+    $ python -m examples.nxp.aot_neutron_compile --quantize \
+        --delegate --neutron_converter_flavor SDK_25_09 -m cifar10 
+    ```
 
-3. It will generate you `cifar10_nxp_delegate.pte` file which can be used with the MXUXpresso SDK `cifarnet_example` project, presented [here](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html#how-to-build-and-run-executorch-cifarnet-example).
-To get the MCUXpresso SDK follow this [guide](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/getting_mcuxpresso.html), use the MCUXpresso SDK v25.03.00. 
\ No newline at end of file
+2. It will generate you `cifar10_nxp_delegate.pte` file which can be used with the MCUXpresso SDK `cifarnet_example` 
+project, presented [here](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html#how-to-build-and-run-executorch-cifarnet-example).
+This project will guide you through the process of deploying your PTE model to the device.
+To get the MCUXpresso SDK follow this [guide](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/getting_mcuxpresso.html),
+use the MCUXpresso SDK v25.09.00. 
diff --git a/examples/openvino/README.md b/examples/openvino/README.md
index 8856ccdce4e..83e3daf6849 100644
--- a/examples/openvino/README.md
+++ b/examples/openvino/README.md
@@ -9,7 +9,10 @@ Below is the layout of the `examples/openvino` directory, which includes the nec
 ```
 examples/openvino
 ├── README.md                           # Documentation for examples (this file)
-└── aot_optimize_and_infer.py           # Example script to export and execute models
+├── aot_optimize_and_infer.py           # Example script to export and execute models
+└── llama
+    ├── README.md                       # Documentation for Llama example
+    └── llama3_2_ov_4wo.yaml            # Configuration file for exporting Llama3.2 with OpenVINO backend
 ```
 
 # Build Instructions for Examples
@@ -154,7 +157,7 @@ Build the backend libraries and executor runner by executing the script below in
 ```bash
 ./openvino_build.sh
 ```
-The executable is saved in `<executorch_root>/cmake-out/backends/openvino/`
+The executable is saved in `<executorch_root>/cmake-out/`
 
 ### Run the Example with Executor Runner
 
@@ -163,9 +166,9 @@ Now, run the example using the executable generated in the above step. The execu
 #### Command Syntax:
 
 ```
-cd ../../cmake-out/backends/openvino
+cd ../../cmake-out
 
-./openvino_executor_runner \
+./executor_runner \
     --model_path=<path_to_model> \
     --num_executions=<iterations>
 ```
@@ -179,7 +182,7 @@ cd ../../cmake-out/backends/openvino
 Run inference with a given model for 10 iterations:
 
 ```
-./openvino_executor_runner \
+./executor_runner \
     --model_path=model.pte \
     --num_executions=10
 ```
diff --git a/examples/openvino/llama/README.md b/examples/openvino/llama/README.md
new file mode 100644
index 00000000000..a98645b3918
--- /dev/null
+++ b/examples/openvino/llama/README.md
@@ -0,0 +1,45 @@
+
+# Export Llama with OpenVINO Backend
+
+## Download the Model
+Follow the [instructions](../../../examples/models/llama/README.md#step-2-prepare-model) to download the required model files. Export Llama with OpenVINO backend is only verified with Llama-3.2-1B variants at this time.
+
+## Environment Setup
+Follow the [instructions](../../../backends/openvino/README.md) of **Prerequisites** and **Setup** in `backends/openvino/README.md` to set up the OpenVINO backend.
+
+## Export the model:
+Navigate into `<executorch_root>/examples/openvino/llama` and execute the commands below to export the model. Update the model file paths to match the location where your model is downloaded. Replace device with the target hardware you want to compile the model for (`CPU`, `GPU`, or `NPU`). The exported model will be generated in the same directory with the filename `llama3_2_ov.pte`. For modifying the output name, change `output_name` in `llama3_2_ov_4wo.yaml` file under `export`.
+
+```
+LLAMA_CHECKPOINT=<path/to/model/folder>/consolidated.00.pth
+LLAMA_PARAMS=<path/to/model/folder>/params.json
+LLAMA_TOKENIZER=<path/to/model/folder>/tokenizer.model
+
+python -m executorch.extension.llm.export.export_llm \
+  --config llama3_2_ov_4wo.yaml \
+  +backend.openvino.device="CPU" \
+  +base.model_class="llama3_2" \
+  +base.checkpoint="${LLAMA_CHECKPOINT:?}" \
+  +base.params="${LLAMA_PARAMS:?}" \
+  +base.tokenizer_path="${LLAMA_TOKENIZER:?}"
+```
+
+### Compress Model Weights and Export
+OpenVINO backend also offers Quantization support for llama models when exporting the model. The different quantization modes that are offered are INT4 groupwise & per-channel weights compression and INT8 per-channel weights compression. It can be achieved by setting `pt2e_quantize` option in `llama3_2_ov_4wo.yaml` file under `quantization`. Set this parameter to `openvino_4wo` for INT4 or `openvino_8wo` for INT8 weight compression. It is set to `openvino_4wo` in `llama3_2_ov_4wo.yaml` file by default. For modifying the group size, set `group_size` option in `llama3_2_ov_4wo.yaml` file under `quantization`. By default group size 128 is used to achieve optimal performance with the NPU.
+
+## Build OpenVINO C++ Runtime with Llama Runner:
+First, build the backend libraries by executing the script below in `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --cpp_runtime
+```
+Then, build the llama runner by executing the script below (with `--llama_runner` argument) also in `<executorch_root>/backends/openvino/scripts` folder:
+```bash
+./openvino_build.sh --llama_runner
+```
+The executable is saved in `<executorch_root>/cmake-out/examples/models/llama/llama_main`
+
+## Execute Inference Using Llama Runner
+Update the model tokenizer file path to match the location where your model is downloaded and replace the prompt.
+```
+./cmake-out/examples/models/llama/llama_main --model_path=<executorch_root>/examples/openvino/llama/llama3_2.pte --tokenizer_path=<path/to/model/folder>/tokenizer.model --prompt="Your custom prompt"
+```
diff --git a/examples/openvino/llama/llama3_2_ov_4wo.yaml b/examples/openvino/llama/llama3_2_ov_4wo.yaml
new file mode 100644
index 00000000000..8fb1d7a1c09
--- /dev/null
+++ b/examples/openvino/llama/llama3_2_ov_4wo.yaml
@@ -0,0 +1,21 @@
+base:
+  metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+
+model:
+  use_kv_cache: true
+  dtype_override: fp32
+  enable_dynamic_shape: false
+
+quantization:
+  pt2e_quantize: "openvino_4wo"
+  group_size: 128
+
+export:
+  output_name: "llama3_2_ov.pte"
+
+backend:
+  openvino:
+    enabled: true
+
+debug:
+  verbose: false
diff --git a/examples/portable/custom_ops/CMakeLists.txt b/examples/portable/custom_ops/CMakeLists.txt
index 4188554af79..8e679697b47 100644
--- a/examples/portable/custom_ops/CMakeLists.txt
+++ b/examples/portable/custom_ops/CMakeLists.txt
@@ -117,8 +117,14 @@ list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
 add_executable(custom_ops_executor_runner ${_executor_runner__srcs})
 target_link_libraries(
-  custom_ops_executor_runner custom_ops_lib executorch extension_evalue_util
-  extension_runner_util gflags
+  custom_ops_executor_runner
+  custom_ops_lib
+  executorch
+  extension_evalue_util
+  extension_runner_util
+  gflags
+  extension_data_loader
+  extension_flat_tensor
 )
 target_compile_options(
   custom_ops_executor_runner PUBLIC ${_common_compile_options}
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 5ce872eec8e..0974e751203 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
 
 #include <executorch/extension/data_loader/file_data_loader.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/runner_util/inputs.h>
 #include <executorch/runtime/core/event_tracer.h>
 #include <executorch/runtime/executor/method.h>
@@ -50,6 +51,7 @@ DEFINE_string(
     model_path,
     "model.pte",
     "Model serialized in flatbuffer format.");
+DEFINE_string(data_path, "", "Path to data file.");
 DEFINE_string(inputs, "", "Comma-separated list of input files");
 DEFINE_string(
     output_file,
@@ -72,6 +74,7 @@ DEFINE_int32(
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::EventTracer;
@@ -171,6 +174,34 @@ int main(int argc, char** argv) {
       "FileDataLoader::from() failed: 0x%" PRIx32,
       (uint32_t)loader.error());
 
+  // Load .ptd file if provided
+  std::unique_ptr<FileDataLoader> ptd_loader;
+  std::unique_ptr<FlatTensorDataMap> ptd_data_map;
+  if (!FLAGS_data_path.empty()) {
+    const char* data_path = FLAGS_data_path.c_str();
+    Result<FileDataLoader> ptd_loader_result = FileDataLoader::from(data_path);
+    ET_CHECK_MSG(
+        ptd_loader_result.ok(),
+        "FileDataLoader::from() failed for PTD file: 0x%" PRIx32,
+        (uint32_t)ptd_loader_result.error());
+    ptd_loader =
+        std::make_unique<FileDataLoader>(std::move(ptd_loader_result.get()));
+    ET_LOG(Info, "PTD file %s is loaded.", data_path);
+
+    Result<FlatTensorDataMap> ptd_data_map_result =
+        FlatTensorDataMap::load(ptd_loader.get());
+    ET_CHECK_MSG(
+        ptd_data_map_result.ok(),
+        "FlatTensorDataMap::load() failed for PTD file: 0x%" PRIx32,
+        (uint32_t)ptd_data_map_result.error());
+    ptd_data_map = std::make_unique<FlatTensorDataMap>(
+        std::move(ptd_data_map_result.get()));
+    ET_LOG(
+        Info,
+        "PTD data map created with %" PRIu64 " keys.",
+        static_cast<uint64_t>(ptd_data_map->get_num_keys().get()));
+  }
+
   std::vector<std::string> inputs_storage;
   std::vector<std::pair<char*, size_t>> input_buffers;
 
@@ -294,7 +325,10 @@ int main(int argc, char** argv) {
   //
   EventTraceManager tracer;
   Result<Method> method = program->load_method(
-      method_name, &memory_manager, tracer.get_event_tracer());
+      method_name,
+      &memory_manager,
+      tracer.get_event_tracer(),
+      ptd_data_map.get());
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl
index 0af45d85075..d1304a84bcb 100644
--- a/examples/portable/executor_runner/targets.bzl
+++ b/examples/portable/executor_runner/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
         ],
         external_deps = [
@@ -38,6 +39,7 @@ def define_common_targets():
             "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
             "//executorch/extension/threadpool:cpuinfo_utils",
             "//executorch/extension/threadpool:threadpool",
diff --git a/examples/qualcomm/README.md b/examples/qualcomm/README.md
index 355209f43a7..31443f2d356 100644
--- a/examples/qualcomm/README.md
+++ b/examples/qualcomm/README.md
@@ -111,12 +111,13 @@ This section outlines the essential APIs and utilities provided to streamline th
    Creates a clean directory for storing model outputs or intermediate results. If the directory already exists, it will be deleted and recreated to ensure a consistent environment for each run.
 
 ## Additional Dependency
+This example requires the following Python packages:
+- pandas and scikit-learn: used in the mobilebert multi-class text classification example.
+- graphviz (optional): used for visualizing QNN graphs during debugging.
 
-The mobilebert multi-class text classification example requires `pandas` and `sklearn`.
 Please install them by something like
-
 ```bash
-pip install scikit-learn pandas
+pip install scikit-learn pandas graphviz
 ```
 
 ## Limitation
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 1be94ec04d6..be25324d63d 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -5,12 +5,13 @@ This file provides you the instructions to run LLM Decoder model with different
  1. LLAMA2 Stories 110M
  2. LLAMA3.2 1B
  3. LLAMA3.2 3B
- 4. Gemma3 1B
- 5. Phi4-mini-instruct
- 6. QWEN2.5 0.5B / 1.5B
- 7. QWEN3 0.6B / 1.7B
- 8. SmolLM2 135M
- 9. SmolLM3 3B
+ 4. Gemma 2B
+ 5. Gemma3 1B
+ 6. Phi4-mini-instruct
+ 7. QWEN2.5 0.5B / 1.5B
+ 8. QWEN3 0.6B / 1.7B
+ 9. SmolLM2 135M
+ 10. SmolLM3 3B
  
 
 We offer the following modes to execute the model:
@@ -37,6 +38,7 @@ We offer the following modes to execute the model:
 ### Step 1: Setup
 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
 2. Follow the [tutorial](https://pytorch.org/executorch/main/backends-qualcomm) to build Qualcomm AI Engine Direct Backend.
+3. Please install the llm eval dependency via [examples/models/llama/install_requirements.sh](https://github.com/pytorch/executorch/blob/main/examples/models/llama/install_requirements.sh)
 
 ### Step 2: Prepare Model
 
@@ -78,6 +80,13 @@ Default example using kv mode.
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
 ```
 
+#### Gemma 2B
+Default example using hybrid mode
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --temperature 0 --model_mode hybrid --max_seq_len 1024 --prefill_ar_len 128 --decoder_model gemma-2b --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1
+```
+
+
 #### Gemma3 1B
 Default example using hybrid mode
 ```bash
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 5908fcf32a6..628defc1496 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -24,6 +24,7 @@
 )
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
 
+from executorch.examples.models.gemma import convert_weights as convert_gemma_weights
 from executorch.examples.models.gemma3 import convert_weights as convert_gemma3_weights
 from executorch.examples.models.phi_4_mini import (
     convert_weights as convert_phi_4_mini_weights,
@@ -300,6 +301,36 @@ class Llama3_2_3B_Instruct(LLMModelConfig):
     )
 
 
+@register_llm_model("gemma-2b")
+@dataclass(init=False, frozen=True)
+class Gemma_2B(LLMModelConfig):
+    repo_id: str = "google/gemma-2b-it"
+    params_path: str = os.path.join(
+        BASE_DIR, "../../../models/gemma/config/2b_config.json"
+    )
+    convert_weights = convert_gemma_weights
+    transform_weight = False
+    instruct_model = True
+
+    num_sharding = 4
+    # quant config
+    ptq = QuantDtype.use_16a4w_block
+    group_size = 64
+    masked_softmax = True
+    seq_mse_candidates = 0
+    r1 = False
+    r2 = False
+    r3 = False
+    quantization_config_wv_sha_16a8w = get_ptq_per_channel_quant_config(
+        torch.uint16, weight_dtype=torch.int8, act_observer=MinMaxObserver
+    )
+    custom_annotation = (
+        annotate_kv_8bit,
+        annotate_output_16a8w,
+        partial(annotate_wv_sha, quantization_config=quantization_config_wv_sha_16a8w),
+    )
+
+
 @register_llm_model("gemma3-1b")
 @dataclass(init=False, frozen=True)
 class Gemma3(LLMModelConfig):
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
index ac96770b889..d43ceb8351a 100644
--- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py
+++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py
@@ -14,6 +14,7 @@
 DECODER_MODEL_VERSION = {
     "stories260k": "llama2",
     "stories110m": "llama2",
+    "gemma-2b": "gemma",
     "gemma3-1b": "gemma3",
     "phi_4_mini": "phi_4_mini",
     "llama3_2-1b_instruct": "llama3",
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index ae5ae63d509..887e680341f 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -327,6 +327,13 @@ def quantize(
                     chat_template, args.prompt[0], args.system_prompt
                 )
             )
+
+            # Gemma may produce unexpected output if the prompt contains an extra <bos> token.
+            # This can happen after applying a prompt template, which might inject <bos> unintentionally.
+            # To prevent decoding issues, we explicitly remove <bos> token
+            if chat_template and args.decoder_model in {"gemma-2b", "gemma3-1b"}:
+                prompt = prompt.replace("<bos>", "")
+
             graph_module_inference(
                 use_kv_cache=self.llama_meta["get_use_kv_cache"],
                 get_example_inputs=self.get_example_inputs,
@@ -534,14 +541,13 @@ def compile(
         state_dict = torch.load(
             checkpoint, weights_only=True, map_location="cpu", mmap=True
         )
-        if args.decoder_model == "gemma3-1b":
+        if args.decoder_model in {"gemma-2b", "gemma3-1b"}:
             for k, v in state_dict.items():
                 if "norm" not in k:
                     continue
                 # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
                 # See https://github.com/huggingface/transformers/pull/29402
                 state_dict[k] = v.float() + torch.ones(v.shape, dtype=torch.float32)
-
     else:
         state_dict = torch.load(
             args.checkpoint, weights_only=True, map_location="cpu", mmap=True
@@ -1286,7 +1292,11 @@ def export_llama(args) -> None:
         )
         tokenizer_artifacts = tokenizer.save_pretrained(args.artifact)
         tokenizer_config = tokenizer_artifacts[0]
-        runtime_tokenizer_path = tokenizer_artifacts[-1]
+        if args.decoder_model == "gemma-2b":
+            # For Gemma, use tokenizer.model as it doesn't provide pre_tokenizer in tokenizer.json.
+            runtime_tokenizer_path = tokenizer_artifacts[-3]
+        else:
+            runtime_tokenizer_path = tokenizer_artifacts[-1]
         tokenizer = get_tokenizer(runtime_tokenizer_path, tokenizer_config)
 
     # TODO: Remove this once error is resolved.
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 71eaea2b8d6..2bffb35852a 100644
--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -9,7 +9,7 @@
 /**
  * @file
  *
- * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma3 1B,
+ * This tool can run Llama2 110M, Llama3.2 1B / 3B, Gemma 2B, Gemma3 1B,
  * phi4-mini-instruct, Qwen2.5 0.5B / 1.5B, Qwen3 0.6B / 1.7B, SmolLM2 135M,
  * SmolLM3 3B with Qualcomm AI Engine Direct.
  *
@@ -117,6 +117,7 @@ std::string get_formatted_prompt(
       formatted_prompt.append(
           "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
       break;
+    case example::DecoderModelVersion::kGemma:
     case example::DecoderModelVersion::kGemma3:
       formatted_prompt.append("<start_of_turn>user\n");
       formatted_prompt.append(prompt);
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index fe45d4b6a67..0c4884bbccf 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -122,6 +122,8 @@ Runner<T>::Runner(
     decoder_model_version_ = DecoderModelVersion::kLlama2;
   } else if (decoder_model_version == "llama3") {
     decoder_model_version_ = DecoderModelVersion::kLlama3;
+  } else if (decoder_model_version == "gemma") {
+    decoder_model_version_ = DecoderModelVersion::kGemma;
   } else if (decoder_model_version == "gemma3") {
     decoder_model_version_ = DecoderModelVersion::kGemma3;
     cache_mode_ = CacheMode::HybridCache;
@@ -199,7 +201,9 @@ Error Runner<T>::load() {
       decoder_model_version_ == DecoderModelVersion::kSmollm2_135m ||
       decoder_model_version_ == DecoderModelVersion::kSmollm3) {
     eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]);
-  } else if (decoder_model_version_ == DecoderModelVersion::kGemma3) {
+  } else if (
+      decoder_model_version_ == DecoderModelVersion::kGemma ||
+      decoder_model_version_ == DecoderModelVersion::kGemma3) {
     eos_ids->insert(tokenizer_->encode("<end_of_turn>", 0, 0).get()[0]);
   }
 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
index 9f290d79c75..1472093ab66 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -32,6 +32,7 @@ namespace example {
 enum DecoderModelVersion {
   kLlama2 = 0,
   kLlama3,
+  kGemma,
   kGemma3,
   kPhi4,
   kQwen2_5,
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py
index 11b9ab88bfe..036c5060b12 100755
--- a/examples/qualcomm/utils.py
+++ b/examples/qualcomm/utils.py
@@ -918,24 +918,34 @@ def generate_inputs(dest_path: str, file_name: str, inputs=None):
     input_list_file = None
     input_files = []
 
+    def prepare_input_file(tensor, fd, index, sub_index):
+        # transform torch.Tensor to raw file
+        input_file_name = f"input_{index}_{sub_index}.raw"
+        input_file_path = f"{dest_path}/{input_file_name}"
+        if not isinstance(tensor, torch.Tensor):
+            tensor = torch.tensor(tensor)
+        tensor.detach().numpy().tofile(input_file_path)
+        input_files.append(input_file_path)
+        # prepare input_list
+        if sub_index > 0:
+            fd.write(" ")
+        fd.write(input_file_name)
+
     # Prepare input data
     if inputs is not None:
         input_list_file = f"{dest_path}/{file_name}"
         with open(input_list_file, "w") as f:
             for idx, data in enumerate(inputs):
-                for i, d in enumerate(data):
-                    # transform torch.Tensor to raw file
-                    file_name = f"input_{idx}_{i}.raw"
-                    file_path = f"{dest_path}/{file_name}"
-                    if not isinstance(d, torch.Tensor):
-                        d = torch.tensor(d)
-                    d.detach().numpy().tofile(file_path)
-                    input_files.append(file_path)
-
-                    # prepare input_list
-                    if i > 0:
-                        f.write(" ")
-                    f.write(file_name)
+                sub_index = 0
+                for d in data:
+                    if isinstance(d, (list, tuple)):
+                        for sub_d in d:
+                            prepare_input_file(sub_d, f, idx, sub_index)
+                            sub_index += 1
+                    else:
+                        prepare_input_file(d, f, idx, sub_index)
+                        sub_index += 1
+
                 f.write("\n")
 
     return input_list_file, input_files
diff --git a/examples/samsung/scripts/deeplab_v3.py b/examples/samsung/scripts/deeplab_v3.py
new file mode 100644
index 00000000000..b1e8fef65fe
--- /dev/null
+++ b/examples/samsung/scripts/deeplab_v3.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+from typing import Optional
+
+import torch
+import torchvision.transforms.v2 as vision_transform_v2
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet50Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+from torchvision.datasets import VOCSegmentation
+
+
+def get_dataset(
+    data_dir: str,
+    calinum=100,
+    input_transform_compose: Optional[vision_transform_v2.Compose] = None,
+    target_transform_compose: Optional[vision_transform_v2.Compose] = None,
+):
+    if not input_transform_compose:
+        input_transform_compose = vision_transform_v2.Compose(
+            [
+                vision_transform_v2.Resize([224, 224]),
+                vision_transform_v2.ToImage(),
+                vision_transform_v2.ToDtype(torch.float32, scale=True),
+                vision_transform_v2.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+                vision_transform_v2.Lambda(lambda x: x.unsqueeze(0)),  # Add batch dim
+            ]
+        )
+    if not target_transform_compose:
+        target_transform_compose = vision_transform_v2.Compose(
+            [
+                vision_transform_v2.Resize([224, 224]),
+                vision_transform_v2.ToImage(),
+                vision_transform_v2.ToDtype(torch.long, scale=False),
+                vision_transform_v2.Lambda(lambda x: x.unsqueeze(0)),  # Add batch dim
+            ]
+        )
+    voc_dataset = VOCSegmentation(
+        data_dir,
+        "2012",
+        "val",
+        transform=input_transform_compose,
+        target_transform=target_transform_compose,
+    )
+    example_input = [
+        (voc_dataset[i][0],) for i in range(min(calinum, len(voc_dataset)))
+    ]
+    return example_input
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=("path to the validation folder of VOC dataset. "),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./deeplab_v3",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "deeplab_v3"
+    instance = DeepLabV3ResNet50Model()
+    model = DeepLabV3ResNet50Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs = get_dataset(
+            data_dir=f"{args.dataset}",
+            calinum=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/edsr.py b/examples/samsung/scripts/edsr.py
new file mode 100644
index 00000000000..f300a9c8547
--- /dev/null
+++ b/examples/samsung/scripts/edsr.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+from typing import List, Optional, Tuple
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.edsr import EdsrModel
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+from torchsr import transforms
+
+
+def get_dataset(
+    root_dir: str,
+    calinum=100,
+    transform_compose: Optional[transforms.Compose] = None,
+) -> Tuple:
+    """
+    Generate test data from B100 dataset for quantization model
+
+    :param root_dir: Dir of dataset. The real dataset should be in root_dir/SRBenchmarks/benchmark/
+    :param dataset_name: data_set name
+    :param testnum: Number of test data. Default 500
+    :param transform_compose: Transforms to be applied to data.
+        Default:
+        transform_compose = transforms.Compose(
+            [transforms.ToTensor()] # Convert Pillows Image to tensor
+        )
+    :type root_dir: str
+    :type calinum: int
+    :type testnum: int
+    :type transform_compose: transforms.Compose | None
+    :return: (example_input, cali_data, test_data)
+    """
+
+    class SrResize:
+        def __init__(self, expected_size: List[List[int]]):
+            self.expected_size = expected_size
+
+        def __call__(self, x):
+            return (
+                x[0].resize(self.expected_size[0]),
+                x[1].resize(self.expected_size[1]),
+            )
+
+    class SrUnsqueeze:
+        def __call__(self, x):
+            return (
+                x[0].unsqueeze(0),
+                x[1].unsqueeze(0),
+            )
+
+    if not transform_compose:
+        transform_compose = transforms.Compose(
+            [
+                SrResize([[448, 448], [224, 224]]),
+                transforms.ToTensor(),  # Convert Pillows Image to tensor
+                SrUnsqueeze(),
+            ]
+        )
+    from torchsr.datasets import B100
+
+    dataset = B100(root=root_dir, transform=transform_compose, scale=2)
+    example_data = [(dataset[i][1],) for i in range(min(calinum, len(dataset)))]
+    return example_data
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=("path to the validation folder of B100"),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./edsr",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "edsr"
+    instance = EdsrModel()
+    model = EdsrModel().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs = get_dataset(
+            root_dir=f"{args.dataset}",
+            calinum=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/inception_v3.py b/examples/samsung/scripts/inception_v3.py
new file mode 100644
index 00000000000..77540285eab
--- /dev/null
+++ b/examples/samsung/scripts/inception_v3.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.inception_v3 import InceptionV3Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./inception_v3",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "inception_v3"
+    instance = InceptionV3Model()
+    model = InceptionV3Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/inception_v4.py b/examples/samsung/scripts/inception_v4.py
new file mode 100644
index 00000000000..3140682998c
--- /dev/null
+++ b/examples/samsung/scripts/inception_v4.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.inception_v4 import InceptionV4Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (299, 299)
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./inception_v4",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "inception_v4"
+    instance = InceptionV4Model()
+    model = InceptionV4Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/mobilenet_v2.py b/examples/samsung/scripts/mobilenet_v2.py
new file mode 100644
index 00000000000..7c69de38e2c
--- /dev/null
+++ b/examples/samsung/scripts/mobilenet_v2.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.mobilenet_v2 import MV2Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./mobilenetV2",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "mobilenetV2_enn"
+    instance = MV2Model(False)
+    model = MV2Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/mobilenet_v3.py b/examples/samsung/scripts/mobilenet_v3.py
new file mode 100644
index 00000000000..3cc8eadf633
--- /dev/null
+++ b/examples/samsung/scripts/mobilenet_v3.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.mobilenet_v3 import MV3Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./mobilenet_v3",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "mobilenet_v3"
+    instance = MV3Model()
+    model = MV3Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/resnet18.py b/examples/samsung/scripts/resnet18.py
new file mode 100644
index 00000000000..2f3233214ce
--- /dev/null
+++ b/examples/samsung/scripts/resnet18.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.resnet import ResNet18Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./resnet18",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "resnet18"
+    instance = ResNet18Model()
+    model = ResNet18Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/resnet50.py b/examples/samsung/scripts/resnet50.py
new file mode 100644
index 00000000000..1d6c348b641
--- /dev/null
+++ b/examples/samsung/scripts/resnet50.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.resnet import ResNet50Model
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./resnet50",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "resnet50"
+    instance = ResNet50Model()
+    model = ResNet50Model().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/vit.py b/examples/samsung/scripts/vit.py
new file mode 100644
index 00000000000..19c22c473cd
--- /dev/null
+++ b/examples/samsung/scripts/vit.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.torchvision_vit import TorchVisionViTModel
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def get_dataset(dataset_path, data_size):
+    from torchvision import datasets, transforms
+
+    image_shape = (256, 256)
+    crop_size = 224
+    shuffle = True
+
+    def get_data_loader():
+        preprocess = transforms.Compose(
+            [
+                transforms.Resize(image_shape),
+                transforms.CenterCrop(crop_size),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                ),
+            ]
+        )
+        imagenet_data = datasets.ImageFolder(dataset_path, transform=preprocess)
+        return torch.utils.data.DataLoader(
+            imagenet_data,
+            shuffle=shuffle,
+        )
+
+    # prepare input data
+    inputs, targets, input_list = [], [], ""
+    data_loader = get_data_loader()
+    for index, data in enumerate(data_loader):
+        if index >= data_size:
+            break
+        feature, target = data
+        inputs.append((feature,))
+        targets.append(target)
+        input_list += f"input_{index}_0.bin\n"
+
+    return inputs, targets, input_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./vision_transformer",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "vision_transformer"
+    instance = TorchVisionViTModel()
+    model = TorchVisionViTModel().get_eager_model().eval()
+    assert args.calibration_number
+    if args.dataset:
+        inputs, targets, input_list = get_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+        target = None
+        input_list = None
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/samsung/scripts/wav2letter.py b/examples/samsung/scripts/wav2letter.py
new file mode 100644
index 00000000000..33069105d99
--- /dev/null
+++ b/examples/samsung/scripts/wav2letter.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2025 Samsung Electronics Co. LTD
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+from typing import List
+
+import torch
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer import Precision
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.backends.samsung.utils.export_utils import (
+    quantize_module,
+    to_edge_transform_and_lower_to_enn,
+)
+from executorch.examples.models.wav2letter import Wav2LetterModel
+from executorch.examples.samsung.utils import save_tensors
+from executorch.exir import ExecutorchBackendConfig
+from executorch.extension.export_util.utils import save_pte_program
+
+
+class DataManager:
+    class Encoder:
+        def __init__(self, vocab, blank_label="*"):
+            self.vocab = vocab
+            self.char_to_id = {c: i for i, c in enumerate(vocab)}
+            self.blank_label = blank_label
+
+        def encode(self, text):
+            return [self.char_to_id[c] for c in text.lower()]
+
+    @classmethod
+    def _get_voice_dataset(
+        cls, data_size: int, data_dir: str, labels: List[str], fixed_token_num: int
+    ):
+        from torch.utils.data import DataLoader
+        from torchaudio.datasets import LIBRISPEECH
+
+        def collate_fun(batch, encode_fn, mode="train"):
+            waves = []
+            text_ids = []
+            input_lengths = []
+            output_lengths = []
+
+            if mode == "train":
+                shifts = torch.randn(len(batch)) > 0.0
+
+            for i, (wave, _, text, *_) in enumerate(batch):
+                if mode == "train" and shifts[i]:
+                    wave = wave[:, 160:]
+                waves.append(wave[0])
+                ids = torch.LongTensor(encode_fn(text))
+                text_ids.append(ids)
+                input_lengths.append(wave.size(1) // 320)
+                output_lengths.append(len(ids))
+
+            waves = torch.nn.utils.rnn.pad_sequence(waves, batch_first=True).unsqueeze(
+                1
+            )
+            labels = torch.nn.utils.rnn.pad_sequence(text_ids, batch_first=True)
+
+            return waves, labels, input_lengths, output_lengths
+
+        encoder = cls.Encoder(labels)
+
+        testset_url = "test-clean"
+        dataset = LIBRISPEECH(data_dir, url=testset_url)
+        data_loader = DataLoader(
+            dataset=dataset,
+            batch_size=1,
+            shuffle=True,
+            collate_fn=lambda x: collate_fun(x, encoder.encode, "valid"),
+        )
+        # prepare input data
+        inputs, targets = [], []
+        in_lens, tar_lens = [], []
+
+        def _loader():
+            for waves, labels, inputs_len, targets_len in data_loader:
+                if inputs_len[0] >= fixed_token_num:
+                    continue
+                zero_padding = torch.zeros(
+                    [1, 1, fixed_token_num * 320 - waves.shape[2]]
+                )
+                waves = torch.concat((waves, zero_padding), axis=2)
+                yield waves, labels, [fixed_token_num + 1], targets_len
+
+        for i, (waves, labels, inputs_len, targets_len) in enumerate(
+            _loader()
+        ):  # waves, labels, input_lens, output_lens
+            inputs.append(waves)
+            targets.append(labels)
+            in_lens.append(inputs_len)
+            tar_lens.append(targets_len)
+            if i >= data_size:
+                break
+
+        return inputs, targets, in_lens, tar_lens
+
+    @classmethod
+    def get_dataset(
+        cls,
+        data_dir: str,
+        calinum=100,
+        fixed_out_token=300,
+        labels=None,
+    ):
+        if labels is None:
+            labels = [" ", *"abcdefghijklmnopqrstuvwxyz", "'", "*"]
+        dataset = cls._get_voice_dataset(calinum, data_dir, labels, fixed_out_token)
+        example_input = [(dataset[0][i],) for i in range(min(calinum, len(dataset[0])))]
+        return example_input
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-c",
+        "--chipset",
+        default="E9955",
+        help="Samsung chipset, i.e. E9945, E9955, etc",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        default=None,
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+    )
+
+    parser.add_argument(
+        "-p",
+        "--precision",
+        default=None,
+        help=("Quantizaiton precision. If not set, the model will not be quantized."),
+        choices=[None, "A8W8"],
+        type=str,
+    )
+
+    parser.add_argument(
+        "-cn",
+        "--calibration_number",
+        default=100,
+        help=(
+            "Assign the number of data you want "
+            "to use for calibrating the quant params."
+        ),
+        type=int,
+    )
+
+    parser.add_argument(
+        "--dump",
+        default=False,
+        const=True,
+        nargs="?",
+        help=("Whether to dump all outputs. If not set, we only dump pte."),
+        type=bool,
+    )
+
+    parser.add_argument(
+        "-w",
+        "--weight",
+        default=None,
+        help="Absolute path of retrained w2l weight (With .pt format), the vocab size should 29",
+        type=str,
+    )
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. ",
+        default="./wav2letter",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # build pte
+    pte_filename = "wav2letter"
+    instance = Wav2LetterModel()
+    instance.vocab_size = 29
+    model = instance.get_eager_model().eval()
+    if args.weight:
+        weight = torch.load(args.weight, weights_only=True)
+        model.load_state_dict(weight)
+    assert args.calibration_number
+    if args.dataset:
+        inputs = DataManager.get_dataset(
+            data_dir=f"{args.dataset}",
+            calinum=args.calibration_number,
+        )
+    else:
+        inputs = [instance.get_example_inputs() for _ in range(args.calibration_number)]
+
+    test_in = inputs[0]
+    float_out = model(*test_in)
+
+    compile_specs = [gen_samsung_backend_compile_spec(args.chipset)]
+
+    if args.precision:
+        model = quantize_module(
+            model, inputs[0], inputs, getattr(Precision, args.precision)
+        )
+        quant_out = model(*test_in)
+
+    edge_prog = to_edge_transform_and_lower_to_enn(
+        model, inputs[0], compile_specs=compile_specs
+    )
+
+    edge = edge_prog.to_backend(EnnPartitioner(compile_specs))
+    exec_prog = edge.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=True)
+    )
+    save_pte_program(exec_prog, pte_filename, os.path.join(f"{args.artifact}"))
+
+    if args.dump:
+        save_tensors(test_in, "float_in", args.artifact)
+        save_tensors(float_out, "float_out", args.artifact)
+        if args.precision:
+            save_tensors(quant_out, "quant_out", args.artifact)
diff --git a/examples/selective_build/advanced/CMakeLists.txt b/examples/selective_build/advanced/CMakeLists.txt
index 65ebb50bcac..fdef5e6555d 100644
--- a/examples/selective_build/advanced/CMakeLists.txt
+++ b/examples/selective_build/advanced/CMakeLists.txt
@@ -139,7 +139,12 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 target_link_libraries(
   selective_build_test
-  PRIVATE executorch_core extension_evalue_util extension_runner_util
-          gflags::gflags ${selected_kernel_target}
+  PRIVATE executorch_core
+          extension_evalue_util
+          extension_runner_util
+          gflags::gflags
+          extension_flat_tensor
+          extension_data_loader
+          ${selected_kernel_target}
 )
 target_compile_options(selective_build_test PUBLIC ${_common_compile_options})
diff --git a/examples/selective_build/basic/CMakeLists.txt b/examples/selective_build/basic/CMakeLists.txt
index 3cc68ad53b6..d74f94d7b3a 100644
--- a/examples/selective_build/basic/CMakeLists.txt
+++ b/examples/selective_build/basic/CMakeLists.txt
@@ -71,7 +71,12 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 target_link_libraries(
   selective_build_test
-  PRIVATE executorch_core extension_evalue_util extension_runner_util
-          gflags::gflags executorch_kernels
+  PRIVATE executorch_core
+          extension_evalue_util
+          extension_runner_util
+          gflags::gflags
+          executorch_kernels
+          extension_data_loader
+          extension_flat_tensor
 )
 target_compile_options(selective_build_test PUBLIC ${_common_compile_options})
diff --git a/examples/vulkan/export.py b/examples/vulkan/export.py
index c90b501df6f..dace37e5473 100644
--- a/examples/vulkan/export.py
+++ b/examples/vulkan/export.py
@@ -14,22 +14,18 @@
 import backends.vulkan.test.utils as test_utils
 
 import torch
+import torchvision
 
-from executorch.backends.transforms.convert_dtype_pass import I64toI32
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
 from executorch.devtools import BundledProgram
 from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
 from executorch.devtools.bundled_program.serialize import (
     serialize_from_bundled_program_to_flatbuffer,
 )
-from executorch.exir import (
-    EdgeCompileConfig,
-    ExecutorchBackendConfig,
-    to_edge_transform_and_lower,
-)
+from executorch.exir import to_edge_transform_and_lower
 from executorch.extension.export_util.utils import save_pte_program
 from executorch.extension.pytree import tree_flatten
-from torch.export import export
+from torch.export import Dim, export
 
 from ..models import MODEL_NAME_TO_MODEL
 from ..models.model_factory import EagerModelFactory
@@ -38,6 +34,67 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
 
+def is_vision_model(model_name):
+    if model_name in [
+        # These models are also registered in examples/models
+        "dl3",
+        "edsr",
+        "mv2",
+        "mv3",
+        "vit",
+        "ic3",
+        "ic4",
+        "resnet18",
+        "resnet50",
+        # These models are not registered in examples/models but are available via
+        # torchvision
+        "convnext_small",
+        "densenet161",
+        "shufflenet_v2_x1_0",
+    ]:
+        return True
+
+    return False
+
+
+def get_vision_model_sample_input():
+    return (torch.randn(1, 3, 224, 224),)
+
+
+def get_vision_model_dynamic_shapes():
+    return (
+        {
+            2: Dim("height", min=1, max=16) * 16,
+            3: Dim("width", min=1, max=16) * 16,
+        },
+    )
+
+
+def init_model(model_name):
+    if model_name == "convnext_small":
+        return torchvision.models.convnext_small()
+    if model_name == "densenet161":
+        return torchvision.models.densenet161()
+    if model_name == "shufflenet_v2_x1_0":
+        return torchvision.models.shufflenet_v2_x1_0()
+
+    return None
+
+
+def get_sample_inputs(model_name):
+    if is_vision_model(model_name):
+        return get_vision_model_sample_input()
+
+    return None
+
+
+def get_dynamic_shapes(model_name):
+    if is_vision_model(model_name):
+        return get_vision_model_dynamic_shapes()
+
+    return None
+
+
 def main() -> None:
     logger = logging.getLogger("")
     logger.setLevel(logging.INFO)
@@ -68,21 +125,6 @@ def main() -> None:
         help="whether to export with strict mode. Default is True",
     )
 
-    parser.add_argument(
-        "-a",
-        "--segment_alignment",
-        required=False,
-        help="specify segment alignment in hex. Default is 0x1000. Use 0x4000 for iOS",
-    )
-
-    parser.add_argument(
-        "-e",
-        "--external_constants",
-        action=argparse.BooleanOptionalAction,
-        default=False,
-        help="Save constants in external .ptd file. Default is False",
-    )
-
     parser.add_argument(
         "-d",
         "--dynamic",
@@ -119,31 +161,35 @@ def main() -> None:
 
     args = parser.parse_args()
 
-    if args.model_name not in MODEL_NAME_TO_MODEL:
-        raise RuntimeError(
-            f"Model {args.model_name} is not a valid name. "
-            f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+    if args.model_name in MODEL_NAME_TO_MODEL:
+        model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
+            *MODEL_NAME_TO_MODEL[args.model_name]
         )
+    else:
+        model = init_model(args.model_name)
+        example_inputs = get_sample_inputs(args.model_name)
+        dynamic_shapes = get_dynamic_shapes(args.model_name) if args.dynamic else None
 
-    model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model(
-        *MODEL_NAME_TO_MODEL[args.model_name]
-    )
+        if model is None:
+            raise RuntimeError(
+                f"Model {args.model_name} is not a valid name. "
+                f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}."
+            )
 
     # Prepare model
     model.eval()
 
     # Setup compile options
     compile_options = {}
-    if args.dynamic or dynamic_shapes is not None:
+    if args.dynamic:
         compile_options["require_dynamic_shapes"] = True
+        # Try to manually get the dynamic shapes for the model if not set
+        if dynamic_shapes is None:
+            dynamic_shapes = get_dynamic_shapes(args.model_name)
+
     if args.force_fp16:
         compile_options["force_fp16"] = True
 
-    # Configure Edge compilation
-    edge_compile_config = EdgeCompileConfig(
-        _skip_dim_order=False,  # Proper handling for Vulkan memory format
-    )
-
     logging.info(f"Exporting model {args.model_name} with Vulkan delegate")
 
     # Export the model using torch.export
@@ -157,10 +203,6 @@ def main() -> None:
     # Transform and lower with Vulkan partitioner
     edge_program = to_edge_transform_and_lower(
         program,
-        compile_config=edge_compile_config,
-        transform_passes=[
-            I64toI32(edge_compile_config._skip_dim_order),
-        ],
         partitioner=[VulkanPartitioner(compile_options)],
         generate_etrecord=args.etrecord,
     )
@@ -169,13 +211,8 @@ def main() -> None:
         f"Exported and lowered graph:\n{edge_program.exported_program().graph}"
     )
 
-    # Configure backend options
-    backend_config = ExecutorchBackendConfig(external_constants=args.external_constants)
-    if args.segment_alignment is not None:
-        backend_config.segment_alignment = int(args.segment_alignment, 16)
-
     # Create executorch program
-    exec_prog = edge_program.to_executorch(config=backend_config)
+    exec_prog = edge_program.to_executorch()
 
     # Save ETRecord if requested
     if args.etrecord:
diff --git a/exir/TARGETS b/exir/TARGETS
index 853d5e199ba..402e9a21bd1 100644
--- a/exir/TARGETS
+++ b/exir/TARGETS
@@ -79,6 +79,16 @@ runtime.python_library(
     ],
 )
 
+runtime.python_library(
+    name = "tensor_layout",
+    srcs = [
+        "tensor_layout.py",
+    ],
+    deps = [
+        ":scalar_type",
+    ]
+)
+
 runtime.python_library(
     name = "memory",
     srcs = [
diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS
index 1b8b76b7835..51bad73ab5c 100644
--- a/exir/_serialize/TARGETS
+++ b/exir/_serialize/TARGETS
@@ -64,5 +64,6 @@ runtime.python_library(
     deps = [
         "//executorch/exir:schema",
         "//executorch/exir:tensor",
+        "//executorch/exir:tensor_layout",
     ],
 )
diff --git a/exir/_serialize/_serialize.py b/exir/_serialize/_serialize.py
index e2147458545..06e81997654 100644
--- a/exir/_serialize/_serialize.py
+++ b/exir/_serialize/_serialize.py
@@ -16,12 +16,12 @@
     DataEntry,
     DataPayload,
     DataSerializer,
-    TensorLayout,
 )
 
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.exir.emit import EmitterOutput
 from executorch.exir.schema import Tensor, TensorDataLocation
+from executorch.exir.tensor_layout import TensorLayout
 
 
 def serialize_for_executorch(
diff --git a/exir/_serialize/data_serializer.py b/exir/_serialize/data_serializer.py
index e828b4d0ae3..cee34506b66 100644
--- a/exir/_serialize/data_serializer.py
+++ b/exir/_serialize/data_serializer.py
@@ -3,7 +3,7 @@
 from typing import Dict, Optional, Sequence
 
 from executorch.exir._serialize._cord import Cord
-from executorch.extension.flat_tensor.serialize.flat_tensor_schema import TensorLayout
+from executorch.exir.tensor_layout import TensorLayout
 
 
 @dataclass
diff --git a/exir/backend/test/demos/rpc/CMakeLists.txt b/exir/backend/test/demos/rpc/CMakeLists.txt
index 97f90ea9baa..af843954601 100644
--- a/exir/backend/test/demos/rpc/CMakeLists.txt
+++ b/exir/backend/test/demos/rpc/CMakeLists.txt
@@ -36,7 +36,7 @@ target_include_directories(
 )
 install(
   TARGETS executor_backend
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
index 7daa3a247e8..06c1c78ee21 100644
--- a/exir/passes/constant_prop_pass.py
+++ b/exir/passes/constant_prop_pass.py
@@ -164,6 +164,14 @@ def get_propagated_const_tensor_dict(
         with torch.no_grad():
             # Execute the `node.target` and create a new propagated constant tensor.
             prop_constant_tensor = node.target(*args_data, **kwargs_data)
+
+            # ExecuTorch doesn't support zero strides, so we need to ensure the tensor is contiguous
+            # if it has any zero strides from broadcasting/expansion operations
+            if (
+                isinstance(prop_constant_tensor, torch.Tensor)
+                and 0 in prop_constant_tensor.stride()
+            ):
+                prop_constant_tensor = prop_constant_tensor.contiguous()
         const_node_to_tensor[node] = prop_constant_tensor
 
     return const_node_to_tensor
diff --git a/exir/tensor_layout.py b/exir/tensor_layout.py
new file mode 100644
index 00000000000..f8f77ebeea3
--- /dev/null
+++ b/exir/tensor_layout.py
@@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import List
+
+from executorch.exir.scalar_type import ScalarType
+
+
+# Note: keep this in sync with the TensorLayout definition in
+# executorch/extension/flat_tensor/serialize/flat_tensor.fbs
+@dataclass
+class TensorLayout:
+    scalar_type: ScalarType
+    sizes: List[int]
+    dim_order: List[int]
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 716b808b087..14f105e8205 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -24,7 +24,17 @@
 from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
     QuantizationConfig,
 )
-from executorch.exir import EdgeCompileConfig, EdgeProgramManager, memory, to_edge
+from executorch.backends.xnnpack.utils.configs import (
+    get_xnnpack_executorch_backend_config,
+)
+
+from executorch.exir import (
+    EdgeCompileConfig,
+    EdgeProgramManager,
+    memory,
+    to_edge,
+    to_edge_transform_and_lower,
+)
 from executorch.exir.dialects._ops import bind_pattern_to_op, ops, ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.emit import emit_program
@@ -2022,3 +2032,64 @@ def forward(self, x):
         pass_result = constant_prop_pass(edge.exported_program())
         # 1 constant: a (= self.w @ self.cst)
         self.assertEqual(1, len(pass_result.constants))
+
+    def test_constant_prop_pass_zero_stride_tensors(self) -> None:
+        """
+        Test that constant propagation correctly handles tensors with zero strides
+        by converting them to contiguous tensors. Zero-stride tensors can be created
+        by operations like expand() and are not supported by ExecuTorch.
+        """
+
+        class ZeroStrideModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.const_param = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0]))
+
+            def forward(self, x):
+                unsqueezed = self.const_param.unsqueeze(
+                    1
+                )  # Shape: (3, 1), strides: (1, 1)
+                # expand creates zero-stride tensor
+                expanded = unsqueezed.expand(3, 5)  # Shape: (3, 5), strides: (1, 0)
+
+                # Use the expanded tensor with the input to prevent elimination
+                result = x + expanded.sum()
+                return result
+
+        model = ZeroStrideModel()
+        x = torch.randn(3, 5)
+        exported = torch.export.export(model, (x,))
+
+        # Before constant prop: verify we have the parameter
+        self.assertIn("const_param", exported.state_dict)
+
+        const_prop_result = constant_prop_pass(exported)
+        lowered = to_edge_transform_and_lower(
+            const_prop_result,
+            partitioner=[XnnpackPartitioner()],
+        )
+
+        # Should go through
+        lowered.to_executorch(get_xnnpack_executorch_backend_config([SpecPropPass()]))
+        self.assertGreater(len(const_prop_result.constants), 0)
+
+        # Find the propagated constant tensor
+        prop_tensor = None
+        for constant_name, constant_tensor in const_prop_result.constants.items():
+            if constant_name.startswith("_prop_tensor_constant"):
+                prop_tensor = constant_tensor
+                break
+
+        # Verify the propagated tensor exists and has no zero strides
+        self.assertIsNotNone(prop_tensor)
+        self.assertNotIn(
+            0,
+            prop_tensor.stride(),
+            f"Propagated tensor still has zero stride: {prop_tensor.stride()}",
+        )
+
+        # Verify the tensor is contiguous
+        self.assertTrue(
+            prop_tensor.is_contiguous(),
+            f"Propagated tensor is not contiguous: {prop_tensor.stride()}",
+        )
diff --git a/export/export.py b/export/export.py
index 86a932d153c..1e9cdbde7c0 100644
--- a/export/export.py
+++ b/export/export.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -200,7 +201,9 @@ def _build_stages(self, stages: List[StageType]) -> Dict[StageType, Stage]:
                     aten_transform_passes = list(
                         self._export_recipe.aten_transform_passes
                     )
-                stage = TorchExportStage(aten_transform_passes)
+                stage = TorchExportStage(
+                    aten_transform_passes, strict=self._export_recipe.strict
+                )
             elif stage_type == StageType.TO_EDGE_TRANSFORM_AND_LOWER:
                 stage = EdgeTransformAndLowerStage.from_recipe(self._lowering_recipe)
             elif stage_type == StageType.TO_EDGE:
diff --git a/export/recipe.py b/export/recipe.py
index 18f4b8aebb9..4465da51956 100644
--- a/export/recipe.py
+++ b/export/recipe.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -151,6 +152,7 @@ class ExportRecipe:
         executorch_backend_config: Optional backend configuration for ExecuTorch
         pipeline_stages: Optional list of stages to execute, defaults to a standard pipeline.
         mode: Export mode (debug or release)
+        strict: Set the strict flag in the torch export call.
     """
 
     name: Optional[str] = None
@@ -163,6 +165,7 @@ class ExportRecipe:
     executorch_backend_config: Optional[ExecutorchBackendConfig] = None
     pipeline_stages: Optional[List[StageType]] = None
     mode: Mode = Mode.RELEASE
+    strict: bool = True
 
     @classmethod
     def get_recipe(cls, recipe: "RecipeType", **kwargs) -> "ExportRecipe":
diff --git a/export/stages.py b/export/stages.py
index 323b327bfa4..3be801c6a14 100644
--- a/export/stages.py
+++ b/export/stages.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -110,9 +111,11 @@ def __init__(
         aten_transform_passes: Optional[
             List[Callable[[str, ExportedProgram], ExportedProgram]]
         ] = None,
+        strict=True,
     ) -> None:
         super().__init__()
         self._aten_transform_passes = aten_transform_passes
+        self.strict = strict
 
     @property
     def stage_type(self) -> str:
@@ -147,7 +150,7 @@ def run(self, artifact: PipelineArtifact) -> None:
                     model,
                     example_inputs[method_name][0],
                     dynamic_shapes=method_dynamic_shapes,
-                    strict=True,
+                    strict=self.strict,
                 )
 
                 # Apply pre-edge transform passes if available
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index 34a1d3d2fd0..38b28a1407a 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -18,7 +18,10 @@ endif()
 
 set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
-set(_common_compile_options -Wno-deprecated-declarations -fPIC)
+set(_common_compile_options
+    $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
 if(NOT ANDROID_PLATFORM)
   set(ANDROID_PLATFORM android-30)
 endif()
diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle
index e36044e3da5..0c18d60721e 100644
--- a/extension/android/executorch_android/build.gradle
+++ b/extension/android/executorch_android/build.gradle
@@ -15,6 +15,7 @@ plugins {
 
 def qnnVersion = System.properties['qnnVersion']
 def execuTorchVersion = System.properties['execuTorchVersion']
+def flavor = System.properties['flavor']
 
 android {
     namespace = "org.pytorch.executorch"
@@ -69,7 +70,7 @@ mavenPublishing {
   publishToMavenCentral()
   signAllPublications()
 
-  coordinates("org.pytorch", "executorch-android" + (qnnVersion ? "-qnn" : ""), execuTorchVersion ? execuTorchVersion : "0.7.0-SNAPSHOT")
+  coordinates("org.pytorch", "executorch-android" + (flavor ? "-" + flavor : ""), execuTorchVersion ? execuTorchVersion : "1.0.0-SNAPSHOT")
 
   pom {
     name = "ExecuTorch Android"
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index 289df5defd9..f135731f26a 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -11,6 +11,7 @@
 import com.facebook.jni.HybridData;
 import com.facebook.jni.annotations.DoNotStrip;
 import java.io.File;
+import java.util.List;
 import org.pytorch.executorch.ExecuTorchRuntime;
 import org.pytorch.executorch.annotations.Experimental;
 
@@ -32,14 +33,22 @@ public class LlmModule {
 
   @DoNotStrip
   private static native HybridData initHybrid(
-      int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath);
+      int modelType,
+      String modulePath,
+      String tokenizerPath,
+      float temperature,
+      List<String> dataFiles);
 
   /**
    * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
-   * data path.
+   * dataFiles.
    */
   public LlmModule(
-      int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+      int modelType,
+      String modulePath,
+      String tokenizerPath,
+      float temperature,
+      List<String> dataFiles) {
     ExecuTorchRuntime runtime = ExecuTorchRuntime.getRuntime();
 
     File modelFile = new File(modulePath);
@@ -50,12 +59,22 @@ public LlmModule(
     if (!tokenizerFile.canRead() || !tokenizerFile.isFile()) {
       throw new RuntimeException("Cannot load tokenizer path " + tokenizerPath);
     }
-    mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataPath);
+
+    mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, dataFiles);
+  }
+
+  /**
+   * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+   * data path.
+   */
+  public LlmModule(
+      int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) {
+    this(modelType, modulePath, tokenizerPath, temperature, List.of(dataPath));
   }
 
   /** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */
   public LlmModule(String modulePath, String tokenizerPath, float temperature) {
-    this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, null);
+    this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of());
   }
 
   /**
@@ -63,12 +82,12 @@ public LlmModule(String modulePath, String tokenizerPath, float temperature) {
    * path.
    */
   public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) {
-    this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, dataPath);
+    this(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, List.of(dataPath));
   }
 
   /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */
   public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) {
-    this(modelType, modulePath, tokenizerPath, temperature, null);
+    this(modelType, modulePath, tokenizerPath, temperature, List.of());
   }
 
   /** Constructs a LLM Module for a model with the given LlmModuleConfig */
diff --git a/extension/android/jni/jni_helper.cpp b/extension/android/jni/jni_helper.cpp
index b92856bacb2..6491524c7ac 100644
--- a/extension/android/jni/jni_helper.cpp
+++ b/extension/android/jni/jni_helper.cpp
@@ -13,10 +13,13 @@ namespace executorch::jni_helper {
 void throwExecutorchException(uint32_t errorCode, const std::string& details) {
   // Get the current JNI environment
   auto env = facebook::jni::Environment::current();
+  if (!env) {
+    return;
+  }
 
-  // Find the Java ExecutorchRuntimeException class
-  static auto exceptionClass = facebook::jni::findClassLocal(
-      "org/pytorch/executorch/ExecutorchRuntimeException");
+  // stable/global class ref — safe to cache
+  static const auto exceptionClass =
+      JExecutorchRuntimeException::javaClassStatic();
 
   // Find the static factory method: makeExecutorchException(int, String)
   static auto makeExceptionMethod =
diff --git a/extension/android/jni/jni_helper.h b/extension/android/jni/jni_helper.h
index 996d75581d3..898c1619d9c 100644
--- a/extension/android/jni/jni_helper.h
+++ b/extension/android/jni/jni_helper.h
@@ -23,4 +23,11 @@ namespace executorch::jni_helper {
  */
 void throwExecutorchException(uint32_t errorCode, const std::string& details);
 
+// Define the JavaClass wrapper
+struct JExecutorchRuntimeException
+    : public facebook::jni::JavaClass<JExecutorchRuntimeException> {
+  static constexpr auto kJavaDescriptor =
+      "Lorg/pytorch/executorch/ExecutorchRuntimeException;";
+};
+
 } // namespace executorch::jni_helper
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index cabf30c42e4..a0c90991bf7 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -140,13 +140,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       facebook::jni::alias_ref<jstring> model_path,
       facebook::jni::alias_ref<jstring> tokenizer_path,
       jfloat temperature,
-      facebook::jni::alias_ref<jstring> data_path) {
+      facebook::jni::alias_ref<jobject> data_files) {
     return makeCxxInstance(
         model_type_category,
         model_path,
         tokenizer_path,
         temperature,
-        data_path);
+        data_files);
   }
 
   ExecuTorchLlmJni(
@@ -154,7 +154,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       facebook::jni::alias_ref<jstring> model_path,
       facebook::jni::alias_ref<jstring> tokenizer_path,
       jfloat temperature,
-      facebook::jni::alias_ref<jstring> data_path = nullptr) {
+      facebook::jni::alias_ref<jobject> data_files = nullptr) {
     temperature_ = temperature;
 #if defined(ET_USE_THREADPOOL)
     // Reserve 1 thread for the main thread.
@@ -173,18 +173,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
           model_path->toStdString().c_str(),
           llm::load_tokenizer(tokenizer_path->toStdString()));
     } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
-      std::optional<const std::string> data_path_str = data_path
-          ? std::optional<const std::string>{data_path->toStdString()}
-          : std::nullopt;
+      std::vector<std::string> data_files_vector;
+      if (data_files != nullptr) {
+        // Convert Java List<String> to C++ std::vector<string>
+        auto list_class = facebook::jni::findClassStatic("java/util/List");
+        auto size_method = list_class->getMethod<jint()>("size");
+        auto get_method =
+            list_class->getMethod<facebook::jni::local_ref<jobject>(jint)>(
+                "get");
+
+        jint size = size_method(data_files);
+        for (jint i = 0; i < size; ++i) {
+          auto str_obj = get_method(data_files, i);
+          auto jstr = facebook::jni::static_ref_cast<jstring>(str_obj);
+          data_files_vector.push_back(jstr->toStdString());
+        }
+      }
       runner_ = executorch::extension::llm::create_text_llm_runner(
           model_path->toStdString(),
           llm::load_tokenizer(tokenizer_path->toStdString()),
-          data_path_str);
+          data_files_vector);
 #if defined(EXECUTORCH_BUILD_QNN)
     } else if (model_type_category == MODEL_TYPE_QNN_LLAMA) {
       std::unique_ptr<executorch::extension::Module> module = std::make_unique<
           executorch::extension::Module>(
           model_path->toStdString().c_str(),
+          data_files_set,
           executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
       std::string decoder_model = "llama3"; // use llama3 for now
       runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
@@ -192,7 +206,6 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
           decoder_model.c_str(),
           model_path->toStdString().c_str(),
           tokenizer_path->toStdString().c_str(),
-          data_path->toStdString().c_str(),
           "");
       model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
 #endif
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
index cda9a914bc3..9b8400d739f 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h
@@ -126,14 +126,14 @@ NS_SWIFT_NAME(Module)
  * Initializes a module with a file path, data path and a specified load mode.
  *
  * @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
  * external tensors and external data.
  * @param loadMode A value from ExecuTorchModuleLoadMode that determines the
  * file loading behavior.
  * @return An initialized ExecuTorchModule instance.
  */
 - (instancetype)initWithFilePath:(NSString *)filePath
-                    dataFilePath:(NSString *)dataFilePath
+                   dataFilePaths:(NSArray<NSString *> *)dataFilePaths
                         loadMode:(ExecuTorchModuleLoadMode)loadMode
     NS_DESIGNATED_INITIALIZER;
 
@@ -141,12 +141,12 @@ NS_SWIFT_NAME(Module)
  * Initializes a module with a file path, data path and a specified load mode.
  *
  * @param filePath A string representing the path to the ExecuTorch program file.
- * @param dataFilePath A string representing the path to a .ptd file with
+ * @param dataFilePaths A list of strings representing paths to .ptd files with
  * external tensors and external data.
  * @return An initialized ExecuTorchModule instance.
  */
 - (instancetype)initWithFilePath:(NSString *)filePath
-                    dataFilePath:(NSString *)dataFilePath;
+                   dataFilePaths:(NSArray<NSString *> *)dataFilePaths;
 
 /**
  * Initializes a module with a file path and a specified load mode.
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
index ce58f2fb21a..69bb59c860e 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm
@@ -250,13 +250,20 @@ @implementation ExecuTorchModule {
 }
 
 - (instancetype)initWithFilePath:(NSString *)filePath
-                    dataFilePath:(NSString *)dataFilePath
+                   dataFilePaths:(NSArray<NSString *> *)dataFilePaths
                         loadMode:(ExecuTorchModuleLoadMode)loadMode {
   self = [super init];
   if (self) {
+    // Convert NSArray<NSString *> to std::vector<std::string>
+    std::vector<std::string> dataFilePathsVector;
+    if (dataFilePaths != nil) {
+      for (NSString *dataFile in dataFilePaths) {
+        dataFilePathsVector.emplace_back(dataFile.UTF8String);
+      }
+    }
     _module = std::make_unique<Module>(
       filePath.UTF8String,
-      dataFilePath.UTF8String,
+      dataFilePathsVector,
       static_cast<Module::LoadMode>(loadMode)
     );
     _inputs = [NSMutableDictionary new];
@@ -266,21 +273,21 @@ - (instancetype)initWithFilePath:(NSString *)filePath
 }
 
 - (instancetype)initWithFilePath:(NSString *)filePath
-                    dataFilePath:(NSString *)dataFilePath {
+                   dataFilePaths:(NSArray<NSString *> *)dataFilePaths {
   return [self initWithFilePath:filePath
-                   dataFilePath:dataFilePath
+                  dataFilePaths:dataFilePaths
                        loadMode:ExecuTorchModuleLoadModeFile];
 }
 
 - (instancetype)initWithFilePath:(NSString *)filePath
                         loadMode:(ExecuTorchModuleLoadMode)loadMode {
   return [self initWithFilePath:filePath
-                   dataFilePath:@""
+                  dataFilePaths:@[]
                        loadMode:loadMode];
 }
 - (instancetype)initWithFilePath:(NSString *)filePath {
   return [self initWithFilePath:filePath
-                   dataFilePath:@""
+                  dataFilePaths:@[]
                        loadMode:ExecuTorchModuleLoadModeFile];
 }
 
diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm
index 3a2b640b7d7..3b1c06a5aa0 100644
--- a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm
+++ b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm
@@ -129,7 +129,7 @@ - (instancetype)initWithNativeInstance:(void *)nativeInstance {
 - (instancetype)initWithTensor:(ExecuTorchTensor *)otherTensor {
   ET_CHECK(otherTensor);
   auto tensor = make_tensor_ptr(
-    **reinterpret_cast<TensorPtr *>(otherTensor.nativeInstance)
+    *reinterpret_cast<TensorPtr *>(otherTensor.nativeInstance)
   );
   return [self initWithNativeInstance:&tensor];
 }
@@ -271,7 +271,7 @@ - (NSString *)description {
       ET_CHECK_MSG(false, "Unsupported dtype in description");
     }
   } ctx;
-  ET_SWITCH_REALHBBF16_TYPES(
+  ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
     static_cast<ScalarType>(_tensor->scalar_type()),
     ctx,
     "description",
diff --git a/extension/data_loader/CMakeLists.txt b/extension/data_loader/CMakeLists.txt
index a5e7a0c4a81..b45ba0594e1 100644
--- a/extension/data_loader/CMakeLists.txt
+++ b/extension/data_loader/CMakeLists.txt
@@ -41,7 +41,7 @@ target_compile_options(extension_data_loader PUBLIC ${_common_compile_options})
 install(
   TARGETS extension_data_loader
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/evalue_util/print_evalue.cpp b/extension/evalue_util/print_evalue.cpp
index 32009011012..83d71cffb43 100644
--- a/extension/evalue_util/print_evalue.cpp
+++ b/extension/evalue_util/print_evalue.cpp
@@ -160,7 +160,7 @@ void print_tensor(std::ostream& os, executorch::aten::Tensor tensor) {
     break;
 
   switch (tensor.scalar_type()) {
-    ET_FORALL_REAL_TYPES_AND2(Bool, Half, PRINT_TENSOR_DATA)
+    ET_FORALL_REALHBBF16_TYPES(PRINT_TENSOR_DATA)
     default:
       os << "[<unhandled scalar type " << (int)tensor.scalar_type() << ">]";
   }
diff --git a/extension/flat_tensor/CMakeLists.txt b/extension/flat_tensor/CMakeLists.txt
index ff70bcc9565..9a0ad782ef5 100644
--- a/extension/flat_tensor/CMakeLists.txt
+++ b/extension/flat_tensor/CMakeLists.txt
@@ -31,7 +31,7 @@ target_compile_options(extension_flat_tensor PUBLIC ${_common_compile_options})
 install(
   TARGETS extension_flat_tensor
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp
index 478ce9d63cf..515bfe93c28 100644
--- a/extension/flat_tensor/flat_tensor_data_map.cpp
+++ b/extension/flat_tensor/flat_tensor_data_map.cpp
@@ -55,7 +55,7 @@ Result<const flat_tensor_flatbuffer::NamedData*> get_named_data(
   if (named_data == nullptr) {
     return Error::NotFound;
   }
-  for (int i = 0; i < named_data->size(); i++) {
+  for (flatbuffers::uoffset_t i = 0; i < named_data->size(); ++i) {
     if (key.size() == named_data->Get(i)->key()->size() &&
         std::strncmp(
             named_data->Get(i)->key()->c_str(),
diff --git a/extension/flat_tensor/serialize/TARGETS b/extension/flat_tensor/serialize/TARGETS
index 229f6930f4e..b9ccadf9f23 100644
--- a/extension/flat_tensor/serialize/TARGETS
+++ b/extension/flat_tensor/serialize/TARGETS
@@ -13,6 +13,9 @@ runtime.python_library(
     visibility = [
         "//executorch/...",
     ],
+    deps = [
+        "//executorch/exir:tensor_layout",
+    ]
 )
 
 runtime.python_library(
diff --git a/extension/flat_tensor/serialize/flat_tensor.fbs b/extension/flat_tensor/serialize/flat_tensor.fbs
index abf331697d6..4b71e13e2c4 100644
--- a/extension/flat_tensor/serialize/flat_tensor.fbs
+++ b/extension/flat_tensor/serialize/flat_tensor.fbs
@@ -7,6 +7,8 @@ namespace flat_tensor_flatbuffer;
 file_identifier "FT01";
 file_extension "ptd";
 
+// Note: keep this in sync with the python definition in
+// executorch/exir/tensor_layout.py
 table TensorLayout {
   scalar_type: executorch_flatbuffer.ScalarType;
 
diff --git a/extension/flat_tensor/serialize/flat_tensor_header.cpp b/extension/flat_tensor/serialize/flat_tensor_header.cpp
index b329015e4ce..b055d222465 100644
--- a/extension/flat_tensor/serialize/flat_tensor_header.cpp
+++ b/extension/flat_tensor/serialize/flat_tensor_header.cpp
@@ -14,7 +14,9 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/result.h>
 
+#if defined(__clang__)
 #pragma clang diagnostic ignored "-Wdeprecated"
+#endif
 
 namespace executorch {
 using runtime::Error;
diff --git a/extension/flat_tensor/serialize/flat_tensor_schema.py b/extension/flat_tensor/serialize/flat_tensor_schema.py
index 53b0fe98ea9..2fcf2c6eb81 100644
--- a/extension/flat_tensor/serialize/flat_tensor_schema.py
+++ b/extension/flat_tensor/serialize/flat_tensor_schema.py
@@ -9,18 +9,11 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
-from executorch.exir.scalar_type import ScalarType
+from executorch.exir.tensor_layout import TensorLayout
 
 # Note: check executorch/extension/data_format/flat_tensor.fbs for explanations of these fields.
 
 
-@dataclass
-class TensorLayout:
-    scalar_type: ScalarType
-    sizes: List[int]
-    dim_order: List[int]
-
-
 @dataclass
 class DataSegment:
     offset: int
diff --git a/extension/flat_tensor/test/test_serialize.py b/extension/flat_tensor/test/test_serialize.py
index 13402e60a65..726a8845c2e 100644
--- a/extension/flat_tensor/test/test_serialize.py
+++ b/extension/flat_tensor/test/test_serialize.py
@@ -22,7 +22,7 @@
 from executorch.exir._serialize.padding import aligned_size
 
 from executorch.exir.schema import ScalarType
-from executorch.extension.flat_tensor.serialize.flat_tensor_schema import TensorLayout
+from executorch.exir.tensor_layout import TensorLayout
 
 from executorch.extension.flat_tensor.serialize.serialize import (
     _deserialize_to_flat_tensor,
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index 8523581da8a..b2e36e0a1f2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
                     channels:(NSInteger)channels
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger channels;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
                       frames:(NSInteger)frames
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger frames;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -129,7 +145,7 @@ __attribute__((objc_subclassing_restricted))
  @return A retained ExecuTorchLLMMultimodalInput instance of type Audio.
 */
 + (instancetype)inputWithAudio:(ExecuTorchLLMAudio *)audio
-    NS_SWIFT_NAME(init(audio:))
+    NS_SWIFT_NAME(init(_:))
     NS_RETURNS_RETAINED;
 
 @property(nonatomic, readonly) ExecuTorchLLMMultimodalInputType type;
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index a3dc3e6afd1..964805053e2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
     _width = width;
     _height = height;
     _channels = channels;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _width = width;
+    _height = height;
+    _channels = channels;
+    _isFloat = YES;
   }
   return self;
 }
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
     _batchSize = batchSize;
     _bins = bins;
     _frames = frames;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _batchSize = batchSize;
+    _bins = bins;
+    _frames = frames;
+    _isFloat = YES;
   }
   return self;
 }
@@ -170,6 +202,7 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
     return NO;
   }
   std::vector<llm::MultimodalInput> nativeInputs;
+  nativeInputs.reserve((size_t)inputs.count);
   for (ExecuTorchLLMMultimodalInput *input in inputs) {
     switch (input.type) {
       case ExecuTorchLLMMultimodalInputTypeText:
@@ -177,13 +210,50 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
         break;
       case ExecuTorchLLMMultimodalInputTypeImage: {
         ExecuTorchLLMImage *image = input.image;
-        std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
-        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-          std::move(data),
-          (int32_t)image.width,
-          (int32_t)image.height,
-          (int32_t)image.channels
-        )));
+        if (image.isFloat) {
+          const float *buffer = (const float *)image.data.bytes;
+          size_t elementCount = (size_t)image.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)image.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + image.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        }
+        break;
+      }
+      case ExecuTorchLLMMultimodalInputTypeAudio: {
+        ExecuTorchLLMAudio *audio = input.audio;
+        if (audio.isFloat) {
+          const float *buffer = (const float *)audio.data.bytes;
+          size_t elementCount = (size_t)audio.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        }
         break;
       }
       default: {
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
index 550a20ea633..50957ee47f5 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
@@ -25,12 +25,23 @@ __attribute__((deprecated("This API is experimental.")))
 
  @param modelPath      File system path to the serialized model.
  @param tokenizerPath  File system path to the tokenizer data.
- @param tokens         An array of NSString special tokens to use during tokenization.
+ @return An initialized ExecuTorchLLMTextRunner instance.
+*/
+- (instancetype)initWithModelPath:(NSString *)modelPath
+                    tokenizerPath:(NSString *)tokenizerPath;
+
+/**
+ Initializes a text LLM runner with the given model and tokenizer paths,
+ and a list of special tokens to include in the tokenizer.
+
+ @param modelPath      File system path to the serialized model.
+ @param tokenizerPath  File system path to the tokenizer data.
+ @param specialTokens  An array of NSString special tokens to use during tokenization.
  @return An initialized ExecuTorchLLMTextRunner instance.
 */
 - (instancetype)initWithModelPath:(NSString *)modelPath
                     tokenizerPath:(NSString *)tokenizerPath
-                    specialTokens:(NSArray<NSString *> *)tokens
+                    specialTokens:(NSArray<NSString *> *)specialTokens
     NS_DESIGNATED_INITIALIZER;
 
 /**
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
index 4ea1bd921f7..1a6c3f40045 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
@@ -28,15 +28,22 @@ @implementation ExecuTorchLLMTextRunner {
   std::unique_ptr<llm::TextLLMRunner> _runner;
 }
 
+- (instancetype)initWithModelPath:(NSString*)modelPath
+                    tokenizerPath:(NSString*)tokenizerPath {
+  return [self initWithModelPath:modelPath
+                   tokenizerPath:tokenizerPath
+                   specialTokens:@[]];
+}
+
 - (instancetype)initWithModelPath:(NSString*)modelPath
                     tokenizerPath:(NSString*)tokenizerPath
-                    specialTokens:(NSArray<NSString*>*)tokens {
+                    specialTokens:(NSArray<NSString*>*)specialTokens {
   self = [super init];
   if (self) {
     _modelPath = [modelPath copy];
     _tokenizerPath = [tokenizerPath copy];
     _specialTokens = std::make_unique<std::vector<std::string>>();
-    for (NSString *token in tokens) {
+    for (NSString *token in specialTokens) {
       _specialTokens->emplace_back(token.UTF8String);
     }
   }
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index cdf15f12350..3617245b8f8 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -6,64 +6,163 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+import ExecuTorch
 import ExecuTorchLLM
 import XCTest
 
 extension UIImage {
-  func asImage() -> Image {
-    let targetSide = CGFloat(336)
-    let scale = max(targetSide / size.width, targetSide / size.height)
-    let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
+  func centerCropped(to sideSize: CGFloat) -> UIImage {
+    precondition(sideSize > 0)
     let format = UIGraphicsImageRendererFormat.default()
     format.scale = 1
-    let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
-      draw(in: CGRect(origin: .zero, size: scaledSize))
-    }
-    guard let scaledCGImage = scaledImage.cgImage else {
-      return Image(data: Data(), width: 336, height: 336, channels: 3)
-    }
-    let cropRect = CGRect(
-      x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
-      y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
-      width: targetSide.rounded(.down),
-      height: targetSide.rounded(.down)
-    )
-    let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
-    let imageWidth = croppedCGImage.width
-    let imageHeight = croppedCGImage.height
-    let pixelCount = imageWidth * imageHeight
-    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
-    let context = CGContext(
+    format.opaque = false
+    return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
+      .image { _ in
+        let scaleFactor = max(sideSize / size.width, sideSize / size.height)
+        let scaledWidth = size.width * scaleFactor
+        let scaledHeight = size.height * scaleFactor
+        let originX = (sideSize - scaledWidth) / 2
+        let originY = (sideSize - scaledHeight) / 2
+        draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
+      }
+  }
+
+  func rgbBytes() -> [UInt8]? {
+    guard let cgImage = cgImage else { return nil }
+    let pixelWidth = Int(cgImage.width)
+    let pixelHeight = Int(cgImage.height)
+    let pixelCount = pixelWidth * pixelHeight
+    let bytesPerPixel = 4
+    let bytesPerRow = pixelWidth * bytesPerPixel
+    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+    guard let context = CGContext(
       data: &rgbaBuffer,
-      width: imageWidth,
-      height: imageHeight,
+      width: pixelWidth,
+      height: pixelHeight,
       bitsPerComponent: 8,
-      bytesPerRow: imageWidth * 4,
+      bytesPerRow: bytesPerRow,
       space: CGColorSpaceCreateDeviceRGB(),
       bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
-    )!
-    context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
-    var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
+    ) else { return nil }
+
+    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
+
+    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
     for pixelIndex in 0..<pixelCount {
-      let sourceOffset = pixelIndex * 4
-      planarRGB[pixelIndex] = rgbaBuffer[sourceOffset]
-      planarRGB[pixelIndex + pixelCount] = rgbaBuffer[sourceOffset + 1]
-      planarRGB[pixelIndex + pixelCount * 2] = rgbaBuffer[sourceOffset + 2]
+      let sourceIndex = pixelIndex * bytesPerPixel
+      rgbBytes[pixelIndex] = rgbaBuffer[sourceIndex + 0]
+      rgbBytes[pixelIndex + pixelCount] = rgbaBuffer[sourceIndex + 1]
+      rgbBytes[pixelIndex + 2 * pixelCount] = rgbaBuffer[sourceIndex + 2]
     }
-    return Image(data: Data(planarRGB), width: 336, height: 336, channels: 3)
+    return rgbBytes
+  }
+
+  func rgbBytesNormalized(mean: [Float] = [0, 0, 0], std: [Float] = [1, 1, 1]) -> [Float]? {
+    precondition(mean.count == 3 && std.count == 3)
+    precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
+    guard let rgbBytes = rgbBytes() else { return nil }
+    let pixelCount = rgbBytes.count / 3
+    var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytesNormalized[pixelIndex] =
+        (Float(rgbBytes[pixelIndex]) / 255.0 - mean[0]) / std[0]
+      rgbBytesNormalized[pixelIndex + pixelCount] =
+        (Float(rgbBytes[pixelIndex + pixelCount]) / 255.0 - mean[1]) / std[1]
+      rgbBytesNormalized[pixelIndex + 2 * pixelCount] =
+        (Float(rgbBytes[pixelIndex + 2 * pixelCount]) / 255.0 - mean[2]) / std[2]
+    }
+    return rgbBytesNormalized
+  }
+
+  func asImage(_ sideSize: CGFloat) -> Image {
+    return Image(
+      data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
+  }
+
+  func asNormalizedImage(
+    _ sideSize: CGFloat,
+    mean: [Float] = [0.485, 0.456, 0.406],
+    std: [Float] = [0.229, 0.224, 0.225]
+  ) -> Image {
+    return Image(
+      float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
   }
 }
 
 class MultimodalRunnerTest: XCTestCase {
-  let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
-  let assistantPrompt = "ASSISTANT: "
-  let userPrompt = "What's on the picture?"
-  let sequenceLength = 768
+  let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+
+  func testGemma() {
+    let chatTemplate = "<start_of_turn>user\n%@<end_of_turn>\n<start_of_turn>model"
+    let userPrompt = "What's on the picture?"
+    let sideSize: CGFloat = 896
+    let sequenceLength = 768
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(forResource: "gemma3", ofType: "pte"),
+          let tokenizerPath = bundle.path(forResource: "gemma3_tokenizer", ofType: "model"),
+          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
+          let uiImage = UIImage(contentsOfFile: imagePath) else {
+      XCTFail("Couldn't find model or tokenizer files")
+      return
+    }
+    let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+    var text = ""
+
+    do {
+      try runner.generate([
+        MultimodalInput(systemPrompt),
+        MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
+      ], Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+        if token == "<end_of_turn>" {
+          runner.stop()
+        }
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("waterfall"))
 
-  func test() {
+    text = ""
+    runner.reset()
+    do {
+      try runner.generate([
+        MultimodalInput(systemPrompt),
+        MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
+      ], Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+        if token == "<end_of_turn>" {
+          runner.stop()
+        }
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("waterfall"))
+  }
+
+  func testLLaVA() {
+    let chatTemplate = "USER: %@ ASSISTANT: "
+    let userPrompt = "What's on the picture?"
+    let sideSize: CGFloat = 336
+    let sequenceLength = 768
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+          let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
           let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
           let uiImage = UIImage(contentsOfFile: imagePath) else {
       XCTFail("Couldn't find model or tokenizer files")
@@ -75,8 +174,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
         $0.sequenceLength = sequenceLength
       }) { token in
@@ -92,8 +191,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
         $0.sequenceLength = sequenceLength
       }) { token in
@@ -104,4 +203,47 @@ class MultimodalRunnerTest: XCTestCase {
     }
     XCTAssertTrue(text.lowercased().contains("waterfall"))
   }
+
+  func testVoxtral() throws {
+    let chatTemplate = "%@[/INST]"
+    let userPrompt = "What is the audio about?"
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(forResource: "voxtral", ofType: "pte"),
+          let tokenizerPath = bundle.path(forResource: "voxtral_tokenizer_tekken", ofType: "json"),
+          let audioPath = bundle.path(forResource: "voxtral_input_features", ofType: "bin") else {
+      XCTFail("Couldn't find model or tokenizer files")
+      return
+    }
+    let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+    var audioData = try Data(contentsOf: URL(fileURLWithPath: audioPath), options: .mappedIfSafe)
+    let floatSize = MemoryLayout<Float>.size
+    guard audioData.count % floatSize == 0 else {
+      XCTFail("Invalid audio data")
+      return
+    }
+    let bins = 128
+    let frames = 3000
+    let batchSize = audioData.count / floatSize / (bins * frames)
+    var text = ""
+
+    do {
+      try runner.generate([
+        MultimodalInput("<s>[INST][BEGIN_AUDIO]"),
+        MultimodalInput(Audio(
+          float: audioData,
+          batchSize: batchSize,
+          bins: bins,
+          frames: frames
+        )),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
+      ], Config {
+        $0.maximumNewTokens = 256
+      }) { token in
+        text += token
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("tattoo"))
+  }
 }
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index 5e99af0c57f..0fa2b59d05d 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -39,10 +39,10 @@ class TextRunnerTest: XCTestCase {
   let userPrompt = "The capital of France is called"
   let sequenceLength = 128
 
-  func test() {
+  func testLLaMA() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
+          let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }
@@ -73,4 +73,39 @@ class TextRunnerTest: XCTestCase {
     }
     XCTAssertTrue(text.lowercased().contains("paris"))
   }
+
+  func testPhi4() {
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
+          let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
+      XCTFail("Couldn't find model or tokenizer files")
+      return
+    }
+    let runner = TextRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+    var text = ""
+
+    do {
+      try runner.generate(userPrompt, Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("paris"))
+
+    text = ""
+    runner.reset()
+    do {
+      try runner.generate(userPrompt, Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("paris"))
+  }
 }
diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt
index 1678dc80296..2cdfe547430 100644
--- a/extension/llm/custom_ops/CMakeLists.txt
+++ b/extension/llm/custom_ops/CMakeLists.txt
@@ -16,9 +16,14 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 endif()
 
-set(_common_compile_options -Wno-deprecated-declarations -fPIC)
+set(_common_compile_options
+    $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
+)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
-  list(APPEND _common_compile_options "-march=armv8.2-a+dotprod")
+  list(APPEND _common_compile_options
+       "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-march=armv8.2-a+dotprod>"
+  )
 endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
@@ -83,7 +88,7 @@ target_compile_options(custom_ops PUBLIC ${_common_compile_options})
 install(
   TARGETS custom_ops
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 
 if(EXECUTORCH_BUILD_KERNELS_LLM_AOT)
@@ -135,8 +140,15 @@ if(EXECUTORCH_BUILD_KERNELS_LLM_AOT)
     target_link_libraries(custom_ops_aot_lib PUBLIC pthreadpool cpuinfo)
   endif()
   target_compile_options(
-    custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti
-                              -fexceptions ${_common_compile_options}
+    custom_ops_aot_lib
+    PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc
+           /GR
+           /wd4996>
+           $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations
+           -fPIC
+           -frtti
+           -fexceptions>
+           ${_common_compile_options}
   )
 
   install(
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 01000f3564c..da5c3324662 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -142,9 +142,14 @@ def __init__(
                     {1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
                 )
             else:
-                # Two input arguments: tokens and input_pos but input_pos is static shape
+                # Two input arguments: tokens and input_pos but input_pos is static shape.
+
+                # A runtime assertion is added by torch.ops.llama.update_cache requires that
+                # L['tokens'].size()[1] + input_pos[0].item() < self.max_seq_len
+                # This consttaint L['tokens'].size()[1] to be elf.max_seq_len-1
+                # run with TORCH_LOGS=+dynamic for details
                 self.dynamic_shapes = (
-                    {1: torch.export.Dim("token_dim", max=self.max_seq_len)},
+                    {1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
                     {"input_pos": {0: 1}},
                 )
 
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index b13001c005b..223e5335994 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -215,9 +215,10 @@ class ExportConfig:
         so_library: Shared library to specify custom quantized operators.
         export_only: Whether to stop right after torch.export() and
             just save the exported .pt2 graph file.
-        foundation_weights_file: configure the foundation weights of a model
-            to be placed in a separate file, external to the PTE. Pass the
-            intended file name here.
+        foundation_weights_file: place the foundation weights of the model into
+            a separate file, external to the PTE. Pass the file name here.
+        lora_weights_file: place the lora weights of the model into a
+            separate file, external to the PTE. Pass the file name here.
     """
 
     max_seq_length: int = 128
@@ -227,6 +228,7 @@ class ExportConfig:
     so_library: Optional[str] = None
     export_only: bool = False
     foundation_weights_file: Optional[str] = None
+    lora_weights_file: Optional[str] = None
 
     def __post_init__(self):
         if self.max_context_length < self.max_seq_length:
@@ -279,6 +281,8 @@ class Pt2eQuantize(str, Enum):
 
     xnnpack_dynamic = "xnnpack_dynamic"
     xnnpack_dynamic_qc4 = "xnnpack_dynamic_qc4"
+    openvino_4wo = "openvino_4wo"
+    openvino_8wo = "openvino_8wo"
     qnn_8a8w = "qnn_8a8w"
     qnn_16a16w = "qnn_16a16w"
     qnn_16a4w = "qnn_16a4w"
@@ -452,6 +456,18 @@ class MPSConfig:
     enabled: bool = False
 
 
+@dataclass
+class OpenvinoConfig:
+    """
+    Configures the QNN backend.
+    """
+
+    enabled: bool = False
+    device: str = "CPU"
+    nncf_compression: bool = False
+    nncf_compression_group_size: int = 32
+
+
 @dataclass
 class TorchAOKernelsConfig:
     """
@@ -474,6 +490,7 @@ class BackendConfig:
     vulkan: VulkanConfig = field(default_factory=VulkanConfig)
     qnn: QNNConfig = field(default_factory=QNNConfig)
     mps: MPSConfig = field(default_factory=MPSConfig)
+    openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig)
     torchao: TorchAOKernelsConfig = field(default_factory=TorchAOKernelsConfig)
 
 
@@ -572,6 +589,8 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
             llm_config.export.export_only = args.export_only
         if hasattr(args, "foundation_weights_file"):
             llm_config.export.foundation_weights_file = args.foundation_weights_file
+        if hasattr(args, "lora_weights_file"):
+            llm_config.export.lora_weights_file = args.lora_weights_file
 
         # QuantizationConfig
         if hasattr(args, "quantization_mode"):
@@ -643,6 +662,16 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
         if hasattr(args, "mps"):
             llm_config.backend.mps.enabled = args.mps
 
+        # Openvino
+        if hasattr(args, "openvino"):
+            llm_config.backend.openvino.enabled = args.openvino
+        if hasattr(args, "openvino_device"):
+            llm_config.backend.openvino.device = args.openvino_device
+        if hasattr(args, "nncf_compression"):
+            llm_config.backend.openvino.nncf_compression = args.nncf_compression
+        if hasattr(args, "group_size") and args.group_size:
+            llm_config.backend.openvino.nncf_compression_group_size = args.group_size
+
         # TorchAoKernels
         if any(
             hasattr(args, a)
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index 5fe220f7dd9..03ac2bd91e4 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -68,6 +68,19 @@ def get_mps_partitioner(use_kv_cache: bool = False):
     return MPSPartitioner(compile_specs)  # pyre-fixme[16]
 
 
+def get_openvino_partitioner(device: str):
+    try:
+        from executorch.backends.openvino.partitioner import OpenvinoPartitioner
+        from executorch.exir.backend.backend_details import CompileSpec
+    except ImportError:
+        raise ImportError(
+            "Please install the OpenVINO backend following https://github.com/pytorch/executorch/tree/main/backends/openvino"
+        )
+
+    compile_specs = [CompileSpec("device", device.encode())]
+    return OpenvinoPartitioner(compile_specs)
+
+
 def get_coreml_partitioner(
     ios: int = 15,
     embedding_quantize: Optional[str] = None,
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index 2d87c86d113..592a6666dfa 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -215,6 +215,47 @@ def get_qnn_quantizer(
     return qnn_quantizer, quant_dtype
 
 
+def get_ov_quantizer(
+    pt2e_quantize: str,
+    group_size: int = 128,
+):
+    try:
+        from executorch.backends.openvino.quantizer import (
+            OpenVINOQuantizer,
+            QuantizationMode,
+        )
+    except ImportError:
+        raise ImportError("Please install nncf via backends/openvino/requirements.txt")
+
+    backend, quant_config = pt2e_quantize.split("_")
+    assert (
+        backend == "openvino"
+    ), f"The quantization config is for backend {backend} instead of openvino."
+    assert (
+        group_size
+    ), "Group Size None is Not Supported. It should be set to -1 for per-channel."
+
+    quantization_params = {}
+
+    if quant_config == "4wo":
+        quantization_params["mode"] = QuantizationMode.INT4WO_SYM
+        quantization_params["group_size"] = group_size
+        quantization_params["ratio"] = 1
+
+    elif quant_config == "8wo":
+        quantization_params["mode"] = QuantizationMode.INT8WO_ASYM
+        quantization_params["group_size"] = -1
+        quantization_params["ratio"] = None
+
+    else:
+        raise AssertionError(
+            f"No support for quant type {quant_config}. Support 8a4w, 8a8w only."
+        )
+    ov_quantizer = OpenVINOQuantizer(**quantization_params)
+
+    return ov_quantizer
+
+
 def get_coreml_quantizer(pt2e_quantize: str):
     try:
         from coremltools.optimize.torch.quantization.quantization_config import (
diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py
index 8bf591813ec..7883480c1e7 100644
--- a/extension/llm/export/test/test_builder.py
+++ b/extension/llm/export/test/test_builder.py
@@ -88,7 +88,7 @@ def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> Non
         # Check first element (tokens dimension)
         self.assertIsInstance(result[0], dict)
         self.assertIn(1, result[0])
-        self.assertEqual(result[0][1].max, self.max_seq_len)
+        self.assertEqual(result[0][1].max, self.max_seq_len - 1)
 
         # Check second element (input_pos dimension)
         self.assertIsInstance(result[1], dict)
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index f12de5f1d87..d1e4ff2ce45 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -183,6 +183,24 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
     std::optional<const std::string> data_path,
     float temperature) {
+  if (data_path.has_value()) {
+    std::vector<std::string> data_files;
+    data_files.push_back(data_path.value());
+    return create_text_llm_runner(
+        model_path, std::move(tokenizer), std::move(data_files), temperature);
+  }
+  return create_text_llm_runner(
+      model_path,
+      std::move(tokenizer),
+      std::vector<std::string>(),
+      temperature);
+}
+
+std::unique_ptr<TextLLMRunner> create_text_llm_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::vector<std::string> data_files,
+    float temperature) {
   // Sanity check tokenizer
   if (!tokenizer || !tokenizer->is_loaded()) {
     ET_LOG(Error, "Tokenizer is null or not loaded");
@@ -191,9 +209,9 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 
   // Create the Module
   std::unique_ptr<Module> module;
-  if (data_path.has_value()) {
+  if (data_files.size() > 0) {
     module = std::make_unique<Module>(
-        model_path, data_path.value(), Module::LoadMode::File);
+        model_path, data_files, Module::LoadMode::File);
   } else {
     module = std::make_unique<Module>(model_path, Module::LoadMode::File);
   }
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 191ea3ab090..5c109581e19 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -101,7 +101,28 @@ ET_EXPERIMENTAL std::unordered_set<uint64_t> get_eos_ids(
 ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     const std::string& model_path,
     std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
-    std::optional<const std::string> data_path = std::nullopt,
+    std::optional<const std::string> data_path,
+    float temperature = -1.0f);
+
+/**
+ * @brief Creates a TextLLMRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a TextLLMRunner with all
+ * necessary components for text generation using the specified model and
+ * tokenizer.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_files Vector of paths to additional data required by the model
+ * @param temperature Optional temperature parameter for controlling randomness
+ * (deprecated)
+ * @return std::unique_ptr<TextLLMRunner> Initialized TextLLMRunner instance, or
+ * nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::vector<std::string> data_files = {},
     float temperature = -1.0f);
 
 /**
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 2c83df24f55..7f5a8356979 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -40,7 +40,7 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   // 1. Run encoder model.
   ::executorch::runtime::EValue encoder_output;
   if (input.is_image()) {
-    Image image = input.get_image();
+    const Image& image = input.get_image();
 
     auto method_meta = ET_UNWRAP(
         module_->method_meta(kVisionEncoderMethod),
@@ -67,11 +67,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(
           InvalidArgument,
           "Model expects uint8_t image data, but image has float data.");
     } else {
-      ET_LOG(
-          Error,
+      ET_CHECK_OR_RETURN_ERROR(
+          false,
+          NotSupported,
           "Unsupported image encoder input dtype: %s",
           ::executorch::runtime::toString(expected_dtype));
-      return ::executorch::runtime::Error::NotSupported;
     }
 
     // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
@@ -91,16 +91,49 @@ Result<uint64_t> MultimodalPrefiller::prefill(
 
     encoder_output = image_encoder_outputs[0];
   } else if (input.is_audio()) {
-    Audio audio = input.get_audio();
+    const Audio& audio = input.get_audio();
+
+    auto method_meta = ET_UNWRAP(
+        module_->method_meta(kAudioEncoderMethod),
+        "Failed to get method_meta for %s",
+        kAudioEncoderMethod);
+
+    ET_CHECK_OR_RETURN_ERROR(
+        method_meta.num_inputs() > 0,
+        InvalidArgument,
+        "Audio encoder should have at least 1 input");
+    auto input_meta = ET_UNWRAP(
+        method_meta.input_tensor_meta(0),
+        "Cannot get input tensor meta at index 0");
+    auto expected_dtype = input_meta.scalar_type();
 
-    // Use Audio::toTensor() for tensor creation
+    // Create tensor with original dtype
     auto audio_tensor =
         ET_UNWRAP(audio.toTensor(), "Failed to convert audio to tensor");
+
+    // Convert to expected dtype if needed
+    if (audio_tensor->scalar_type() != expected_dtype) {
+      if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
+        // Convert to bfloat16
+        audio_tensor = ET_UNWRAP(
+            convert_to_bfloat16(audio_tensor),
+            "Failed to convert audio tensor to bfloat16");
+      } else {
+        ET_CHECK_OR_RETURN_ERROR(
+            false,
+            NotSupported,
+            "Unsupported audio encoder input dtype: %s. Expecting %s",
+            ::executorch::runtime::toString(audio_tensor->scalar_type()),
+            ::executorch::runtime::toString(expected_dtype));
+      }
+    }
+
     ET_LOG(
         Info,
         "Audio tensor dim: %zu, dtype: %s",
         audio_tensor->dim(),
         ::executorch::runtime::toString(audio_tensor->scalar_type()));
+
     // Run audio encoder
     auto audio_encoder_result =
         module_->execute(kAudioEncoderMethod, audio_tensor);
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index c1c99ad6c9f..8b7e4e315d8 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -62,11 +62,6 @@ Error MultimodalRunner::load() {
     ET_LOG(Info, format, __VA_ARGS__);     \
   }
 
-Error MultimodalRunner::prefill(std::vector<MultimodalInput>&& inputs) {
-  // Forward to the const reference version
-  return prefill(inputs);
-}
-
 Error MultimodalRunner::prefill(const std::vector<MultimodalInput>& inputs) {
   if (!is_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(load());
@@ -77,16 +72,6 @@ Error MultimodalRunner::prefill(const std::vector<MultimodalInput>& inputs) {
   return Error::Ok;
 }
 
-Error MultimodalRunner::generate(
-    std::vector<MultimodalInput>&& inputs,
-    const GenerationConfig& config,
-    std::function<void(const std::string&)> token_callback,
-    std::function<void(const Stats&)> stats_callback) {
-  // Forward to the const reference version
-  return generate(
-      inputs, config, std::move(token_callback), std::move(stats_callback));
-}
-
 Error MultimodalRunner::generate(
     const std::vector<MultimodalInput>& inputs,
     const GenerationConfig& config,
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index eccf5bde301..caf3c296038 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -119,21 +119,6 @@ class ET_EXPERIMENTAL MultimodalRunner {
       std::function<void(const std::string&)> token_callback = {},
       std::function<void(const Stats&)> stats_callback = {});
 
-  /**
-   * Generate tokens from multimodal inputs with move semantics.
-   * This overload allows efficient transfer of temporary vectors.
-   * @param inputs A vector of MultimodalInput objects (moved).
-   * @param config Generation configuration parameters.
-   * @param token_callback Callback function called for each generated token.
-   * @param stats_callback Callback function for generation statistics.
-   * @return The error code. KV cache position is tracked internally in pos_.
-   */
-  virtual ::executorch::runtime::Error generate(
-      std::vector<MultimodalInput>&& inputs,
-      const GenerationConfig& config,
-      std::function<void(const std::string&)> token_callback = {},
-      std::function<void(const Stats&)> stats_callback = {});
-
   /**
    * Prefill multimodal inputs, for example to reload chat history.
    * @param inputs A vector of MultimodalInput objects containing images and
@@ -143,15 +128,6 @@ class ET_EXPERIMENTAL MultimodalRunner {
   virtual ::executorch::runtime::Error prefill(
       const std::vector<MultimodalInput>& inputs);
 
-  /**
-   * Prefill multimodal inputs with move semantics.
-   * This overload allows efficient transfer of temporary vectors.
-   * @param inputs A vector of MultimodalInput objects (moved).
-   * @return The error code. KV cache position is tracked internally in pos_.
-   */
-  virtual ::executorch::runtime::Error prefill(
-      std::vector<MultimodalInput>&& inputs);
-
   inline void stop() {
     text_token_generator_->stop();
   }
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl
index 242860a195a..e001e8fc154 100644
--- a/extension/llm/runner/targets.bzl
+++ b/extension/llm/runner/targets.bzl
@@ -105,6 +105,7 @@ def define_common_targets():
             exported_headers = [
                 "audio.h",
                 "image.h",
+                "wav_loader.h",
                 "multimodal_input.h",
                 "multimodal_runner.h",
                 "multimodal_prefiller.h",
diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt
index 2aa18000831..81b69c0ab9a 100644
--- a/extension/llm/runner/test/CMakeLists.txt
+++ b/extension/llm/runner/test/CMakeLists.txt
@@ -18,8 +18,13 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs
-    test_generation_config.cpp test_text_llm_runner.cpp test_text_prefiller.cpp
-    test_text_decoder_runner.cpp test_multimodal_input.cpp
+    test_generation_config.cpp
+    test_text_llm_runner.cpp
+    test_text_prefiller.cpp
+    test_text_decoder_runner.cpp
+    test_multimodal_input.cpp
+    test_util.cpp
+    test_wav_loader.cpp
 )
 
 # Add LSan stub for Apple platforms
diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl
index 3339b3b8584..1109ff315ac 100644
--- a/extension/llm/runner/test/targets.bzl
+++ b/extension/llm/runner/test/targets.bzl
@@ -44,3 +44,23 @@ def define_common_targets():
             "//executorch/extension/llm/runner:multimodal_runner_lib",
         ],
     )
+
+    runtime.cxx_test(
+        name = "test_util",
+        srcs = ["test_util.cpp"],
+        deps = [
+            "//executorch/extension/llm/runner:stats",
+            "//executorch/extension/tensor:tensor",
+            "//executorch/runtime/core:core",
+        ],
+    )
+
+    runtime.cxx_test(
+        name = "test_wav_loader",
+        srcs = ["test_wav_loader.cpp"],
+        deps = [
+            "//executorch/extension/testing_util:temp_file",
+            "//executorch/extension/llm/runner:multimodal_runner_lib",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
diff --git a/extension/llm/runner/test/test_util.cpp b/extension/llm/runner/test/test_util.cpp
new file mode 100644
index 00000000000..242e48e6871
--- /dev/null
+++ b/extension/llm/runner/test/test_util.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/llm/runner/util.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+namespace {
+
+using ::executorch::aten::ScalarType;
+using ::executorch::extension::make_tensor_ptr;
+using ::executorch::extension::llm::convert_to_bfloat16;
+
+TEST(ConvertToBFloat16Test, ConvertsFloatTensorData) {
+  auto source_tensor = make_tensor_ptr<float>(
+      {2, 2}, std::vector<float>{0.0f, 1.5f, -2.0f, 3.25f});
+
+  auto result = convert_to_bfloat16(source_tensor);
+  ASSERT_TRUE(result.ok());
+  auto bf16_tensor = *result;
+
+  EXPECT_EQ(bf16_tensor->scalar_type(), ScalarType::BFloat16);
+  EXPECT_EQ(bf16_tensor->numel(), source_tensor->numel());
+
+  auto src_sizes = source_tensor->sizes();
+  auto dst_sizes = bf16_tensor->sizes();
+  ASSERT_EQ(dst_sizes.size(), src_sizes.size());
+  for (size_t dim = 0; dim < dst_sizes.size(); ++dim) {
+    EXPECT_EQ(dst_sizes[dim], src_sizes[dim]);
+  }
+
+  const auto* converted_data = bf16_tensor->const_data_ptr<::c10::BFloat16>();
+  const auto* original_data = source_tensor->const_data_ptr<float>();
+  ASSERT_NE(converted_data, nullptr);
+  ASSERT_NE(original_data, nullptr);
+
+  for (size_t i = 0; i < static_cast<size_t>(source_tensor->numel()); ++i) {
+    EXPECT_NEAR(static_cast<float>(converted_data[i]), original_data[i], 1e-2f);
+  }
+}
+
+TEST(ConvertToBFloat16Test, RejectsNonFloatTensor) {
+  auto non_float_tensor =
+      make_tensor_ptr<int64_t>({3}, std::vector<int64_t>{1, 2, 3});
+
+  auto result = convert_to_bfloat16(non_float_tensor);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), ::executorch::runtime::Error::InvalidArgument);
+}
+
+} // namespace
diff --git a/extension/llm/runner/test/test_wav_loader.cpp b/extension/llm/runner/test/test_wav_loader.cpp
new file mode 100644
index 00000000000..bc3ac0ff324
--- /dev/null
+++ b/extension/llm/runner/test/test_wav_loader.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/llm/runner/wav_loader.h>
+#include <executorch/extension/testing_util/temp_file.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+using executorch::extension::llm::kOneOverIntMax;
+using executorch::extension::llm::kOneOverShortMax;
+using executorch::extension::llm::load_wav_audio_data;
+using executorch::extension::llm::load_wav_header;
+using executorch::extension::llm::WavHeader;
+using executorch::extension::testing::TempFile;
+
+namespace {
+
+// Test fixture to ensure PAL initialization
+class WavLoaderTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Ensure PAL is initialized before tests run
+    executorch::runtime::runtime_init();
+  }
+};
+
+void append_bytes(std::vector<uint8_t>& out, const char* literal) {
+  out.insert(out.end(), literal, literal + 4);
+}
+
+void append_le16(std::vector<uint8_t>& out, uint16_t value) {
+  out.push_back(static_cast<uint8_t>(value & 0xFF));
+  out.push_back(static_cast<uint8_t>((value >> 8) & 0xFF));
+}
+
+void append_le32(std::vector<uint8_t>& out, uint32_t value) {
+  out.push_back(static_cast<uint8_t>(value & 0xFF));
+  out.push_back(static_cast<uint8_t>((value >> 8) & 0xFF));
+  out.push_back(static_cast<uint8_t>((value >> 16) & 0xFF));
+  out.push_back(static_cast<uint8_t>((value >> 24) & 0xFF));
+}
+
+std::vector<uint8_t> make_pcm_wav_bytes(
+    int bits_per_sample,
+    const std::vector<int32_t>& samples,
+    uint16_t num_channels = 1,
+    uint32_t sample_rate = 16000) {
+  const size_t bytes_per_sample = static_cast<size_t>(bits_per_sample / 8);
+  const uint32_t subchunk2_size =
+      static_cast<uint32_t>(samples.size() * bytes_per_sample);
+  const uint32_t byte_rate = sample_rate * num_channels * bytes_per_sample;
+  const uint16_t block_align = num_channels * bytes_per_sample;
+  const uint32_t chunk_size = 36 + subchunk2_size;
+
+  std::vector<uint8_t> bytes;
+  bytes.reserve(44 + subchunk2_size);
+
+  append_bytes(bytes, "RIFF");
+  append_le32(bytes, chunk_size);
+  append_bytes(bytes, "WAVE");
+  append_bytes(bytes, "fmt ");
+  append_le32(bytes, 16); // PCM
+  append_le16(bytes, 1); // AudioFormat PCM
+  append_le16(bytes, num_channels);
+  append_le32(bytes, sample_rate);
+  append_le32(bytes, byte_rate);
+  append_le16(bytes, block_align);
+  append_le16(bytes, static_cast<uint16_t>(bits_per_sample));
+  append_bytes(bytes, "data");
+  append_le32(bytes, subchunk2_size);
+
+  for (int32_t sample : samples) {
+    const uint32_t encoded =
+        static_cast<uint32_t>(static_cast<int32_t>(sample));
+    for (size_t byte_idx = 0; byte_idx < bytes_per_sample; ++byte_idx) {
+      bytes.push_back(static_cast<uint8_t>((encoded >> (8 * byte_idx)) & 0xFF));
+    }
+  }
+
+  return bytes;
+}
+
+} // namespace
+
+TEST_F(WavLoaderTest, LoadHeaderParsesPcmMetadata) {
+  const std::vector<uint8_t> wav_bytes =
+      make_pcm_wav_bytes(16, {0, 32767, -32768});
+  TempFile file(wav_bytes.data(), wav_bytes.size());
+
+  std::unique_ptr<WavHeader> header = load_wav_header(file.path());
+  ASSERT_NE(header, nullptr);
+
+  EXPECT_EQ(header->AudioFormat, 1);
+  EXPECT_EQ(header->NumOfChan, 1);
+  EXPECT_EQ(header->SamplesPerSec, 16000);
+  EXPECT_EQ(header->bitsPerSample, 16);
+  EXPECT_EQ(header->blockAlign, 2);
+  EXPECT_EQ(header->bytesPerSec, 32000);
+  EXPECT_EQ(header->dataOffset, 44);
+  EXPECT_EQ(header->Subchunk2Size, 6);
+}
+
+TEST_F(WavLoaderTest, LoadAudioData16BitNormalizesSamples) {
+  const std::vector<int32_t> samples = {0, 32767, -32768};
+  const std::vector<uint8_t> wav_bytes = make_pcm_wav_bytes(16, samples);
+  TempFile file(wav_bytes.data(), wav_bytes.size());
+
+  std::vector<float> audio = load_wav_audio_data(file.path());
+  ASSERT_EQ(audio.size(), samples.size());
+
+  EXPECT_NEAR(audio[0], 0.0f, 1e-6f);
+  EXPECT_NEAR(audio[1], 32767.0f * kOneOverShortMax, 1e-6f);
+  EXPECT_NEAR(audio[2], -32768.0f * kOneOverShortMax, 1e-6f);
+}
+
+TEST_F(WavLoaderTest, LoadAudioData32BitNormalizesSamples) {
+  const std::vector<int32_t> samples = {
+      0,
+      std::numeric_limits<int32_t>::max(),
+      std::numeric_limits<int32_t>::min()};
+  const std::vector<uint8_t> wav_bytes = make_pcm_wav_bytes(32, samples);
+  TempFile file(wav_bytes.data(), wav_bytes.size());
+
+  std::vector<float> audio = load_wav_audio_data(file.path());
+  ASSERT_EQ(audio.size(), samples.size());
+
+  EXPECT_NEAR(audio[0], 0.0f, 1e-8f);
+  EXPECT_NEAR(
+      audio[1],
+      static_cast<float>(static_cast<double>(samples[1]) * kOneOverIntMax),
+      1e-6f);
+  EXPECT_NEAR(
+      audio[2],
+      static_cast<float>(static_cast<double>(samples[2]) * kOneOverIntMax),
+      1e-6f);
+}
+
+TEST_F(WavLoaderTest, LoadHeaderReturnsNullWhenMagicMissing) {
+  const std::string bogus_contents = "not a wav file";
+  TempFile file(bogus_contents);
+
+  std::unique_ptr<WavHeader> header = load_wav_header(file.path());
+  EXPECT_EQ(header, nullptr);
+}
diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h
index 8fb245107ab..ec08ecfb647 100644
--- a/extension/llm/runner/util.h
+++ b/extension/llm/runner/util.h
@@ -141,6 +141,31 @@ inline runtime::Result<TensorPtr> populate_start_pos_or_cache_position(
   }
 }
 
+/**
+ * Helper function to convert a float tensor to bfloat16.
+ * Creates a new tensor with bfloat16 dtype and copies/converts the data.
+ */
+inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
+convert_to_bfloat16(const ::executorch::extension::TensorPtr& src_tensor) {
+  ET_CHECK_OR_RETURN_ERROR(
+      src_tensor->scalar_type() == ::executorch::aten::ScalarType::Float,
+      InvalidArgument,
+      "BFloat16 conversion only supported from Float source data");
+
+  const auto num_elements = static_cast<size_t>(src_tensor->numel());
+  const float* float_data = src_tensor->const_data_ptr<float>();
+
+  auto bf16_tensor = ::executorch::extension::empty_like(
+      src_tensor, ::executorch::aten::ScalarType::BFloat16);
+  auto* bf16_data =
+      bf16_tensor->mutable_data_ptr<::executorch::aten::BFloat16>();
+  for (size_t i = 0; i < num_elements; ++i) {
+    bf16_data[i] = ::executorch::aten::BFloat16(float_data[i]);
+  }
+
+  return bf16_tensor;
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
diff --git a/extension/llm/runner/wav_loader.h b/extension/llm/runner/wav_loader.h
new file mode 100644
index 00000000000..f49a4d1723e
--- /dev/null
+++ b/extension/llm/runner/wav_loader.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple WAV file loader.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::extension::llm {
+
+constexpr float kOneOverIntMax = 1 / static_cast<float>(INT32_MAX);
+constexpr float kOneOverShortMax = 1 / static_cast<float>(INT16_MAX);
+
+struct WavHeader {
+  /* RIFF Chunk Descriptor */
+  uint8_t RIFF[4];
+  uint32_t ChunkSize;
+  uint8_t WAVE[4];
+  /* "fmt" sub-chunk */
+  uint8_t fmt[4];
+  uint32_t Subchunk1Size;
+  uint16_t AudioFormat;
+  uint16_t NumOfChan;
+  uint32_t SamplesPerSec;
+  uint32_t bytesPerSec;
+  uint16_t blockAlign;
+  uint16_t bitsPerSample;
+  /* "data" sub-chunk */
+  uint32_t dataOffset;
+  uint32_t Subchunk2Size;
+};
+
+inline std::unique_ptr<WavHeader> load_wav_header(const std::string& fp) {
+  std::ifstream file(fp, std::ios::binary);
+  if (!file.is_open()) {
+    ET_CHECK_MSG(false, "Failed to open WAV file: %s", fp.c_str());
+  }
+
+  file.seekg(0, std::ios::end);
+  size_t file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(file_size);
+  file.read(buffer.data(), file_size);
+  file.close();
+
+  const char* data = buffer.data();
+  size_t data_size = buffer.size();
+
+  bool has_riff = false;
+  bool has_wave = false;
+
+  if (data_size >= 4 && std::memcmp(data, "RIFF", 4) == 0) {
+    has_riff = true;
+  }
+
+  if (data_size >= 12 && std::memcmp(data + 8, "WAVE", 4) == 0) {
+    has_wave = true;
+  }
+
+  bool is_wav_file = has_riff && has_wave;
+  std::unique_ptr<WavHeader> header;
+
+  if (is_wav_file) {
+    header = std::make_unique<WavHeader>();
+    size_t default_header_size = sizeof(WavHeader);
+
+    size_t data_offset = 0;
+    for (size_t i = 0; i + 4 < data_size; i++) {
+      if (std::memcmp(data + i, "data", 4) == 0) {
+        data_offset = i;
+        break;
+      }
+    }
+
+    if (data_size >= default_header_size) {
+      std::memcpy(
+          reinterpret_cast<char*>(header.get()), data, default_header_size);
+
+      ET_LOG(Info, "WAV header detected, getting raw audio data.");
+      ET_LOG(
+          Info,
+          "RIFF Header: %c%c%c%c",
+          header->RIFF[0],
+          header->RIFF[1],
+          header->RIFF[2],
+          header->RIFF[3]);
+      ET_LOG(Info, "Chunk Size: %d", header->ChunkSize);
+      ET_LOG(
+          Info,
+          "WAVE Header: %c%c%c%c",
+          header->WAVE[0],
+          header->WAVE[1],
+          header->WAVE[2],
+          header->WAVE[3]);
+      ET_LOG(
+          Info,
+          "Format Header: %c%c%c%c",
+          header->fmt[0],
+          header->fmt[1],
+          header->fmt[2],
+          header->fmt[3]);
+      ET_LOG(Info, "Format Chunk Size: %d", header->Subchunk1Size);
+      ET_LOG(Info, "Audio Format: %d", header->AudioFormat);
+      ET_LOG(Info, "Number of Channels: %d", header->NumOfChan);
+      ET_LOG(Info, "Sample Rate: %d", header->SamplesPerSec);
+      ET_LOG(Info, "Byte Rate: %d", header->bytesPerSec);
+      ET_LOG(Info, "Block Align: %d", header->blockAlign);
+      ET_LOG(Info, "Bits per Sample: %d", header->bitsPerSample);
+
+      if (data_offset != 0) {
+        header->Subchunk2Size =
+            *reinterpret_cast<const int32_t*>(data + data_offset + 4);
+        ET_LOG(Info, "Subchunk2Size: %d", header->Subchunk2Size);
+        header->dataOffset = static_cast<uint32_t>(data_offset + 8);
+      } else {
+        ET_LOG(
+            Error,
+            "WAV file structure is invalid, missing Subchunk2ID 'data' field.");
+        throw std::runtime_error("Invalid WAV file structure");
+      }
+    } else {
+      ET_CHECK_MSG(
+          false,
+          "WAV header detected but file is too small to contain a complete header");
+    }
+  }
+
+  return header;
+}
+
+inline std::vector<float> load_wav_audio_data(const std::string& fp) {
+  std::ifstream file(fp, std::ios::binary);
+  if (!file.is_open()) {
+    ET_CHECK_MSG(false, "Failed to open WAV file: %s", fp.c_str());
+  }
+
+  file.seekg(0, std::ios::end);
+  size_t file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(file_size);
+  file.read(buffer.data(), file_size);
+  file.close();
+
+  auto header = load_wav_header(fp);
+
+  if (header.get() == nullptr) {
+    ET_CHECK_MSG(false, "WAV header not detected in file: %s", fp.c_str());
+  }
+
+  const char* data = buffer.data();
+  size_t data_offset = header->dataOffset;
+  size_t data_size = header->Subchunk2Size;
+  int bits_per_sample = header->bitsPerSample;
+
+  std::vector<float> audio_data;
+
+  if (bits_per_sample == 32) {
+    size_t num_samples = data_size / 4;
+    audio_data.resize(num_samples);
+    const int32_t* input_buffer =
+        reinterpret_cast<const int32_t*>(data + data_offset);
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      audio_data[i] = static_cast<float>(
+          static_cast<double>(input_buffer[i]) * kOneOverIntMax);
+    }
+  } else if (bits_per_sample == 16) {
+    size_t num_samples = data_size / 2;
+    audio_data.resize(num_samples);
+    const int16_t* input_buffer =
+        reinterpret_cast<const int16_t*>(data + data_offset);
+
+    for (size_t i = 0; i < num_samples; ++i) {
+      audio_data[i] = static_cast<float>(
+          static_cast<double>(input_buffer[i]) * kOneOverShortMax);
+    }
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unsupported bits per sample: %d. Only support 32 and 16.",
+        bits_per_sample);
+  }
+
+  ET_LOG(
+      Info,
+      "Loaded %zu audio samples from WAV file: %s",
+      audio_data.size(),
+      fp.c_str());
+
+  return audio_data;
+}
+
+} // namespace executorch::extension::llm
diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index b0076444dec..d710a0cf10c 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit b0076444decffb88166452e26ba688233b905647
+Subproject commit d710a0cf10cfa8cb7ffda33c4e61af63119bc95f
diff --git a/extension/module/CMakeLists.txt b/extension/module/CMakeLists.txt
index 5f114f1befa..4e1c3f160bd 100644
--- a/extension/module/CMakeLists.txt
+++ b/extension/module/CMakeLists.txt
@@ -29,34 +29,39 @@ else()
 endif()
 target_link_libraries(
   extension_module PRIVATE executorch_core extension_data_loader
-                           extension_flat_tensor
+                           extension_flat_tensor extension_named_data_map
 )
 target_include_directories(
   extension_module PUBLIC ${_common_include_directories}
 )
 target_compile_options(
-  extension_module PUBLIC -Wno-deprecated-declarations -fPIC
+  extension_module
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
 )
 
 # Module extension built as a static library. TODO(gjcomer) Remove this target
 # after cleaning up CMake targets.
 add_library(extension_module_static STATIC ${_extension_module__srcs})
 target_link_libraries(
-  extension_module_static PRIVATE executorch_core extension_data_loader
-                                  extension_flat_tensor
+  extension_module_static
+  PRIVATE executorch_core extension_data_loader extension_flat_tensor
+          extension_named_data_map
 )
 target_include_directories(
   extension_module_static PUBLIC ${_common_include_directories}
 )
 target_compile_options(
-  extension_module_static PUBLIC -Wno-deprecated-declarations -fPIC
+  extension_module_static
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/wd4996>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations -fPIC>
 )
 
 # Install libraries
 install(
   TARGETS extension_module extension_module_static
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index 4b1c30ae6b5..9de77bcbc79 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -12,6 +12,7 @@
 #include <executorch/extension/data_loader/mmap_data_loader.h>
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
+#include <executorch/extension/named_data_map/merged_data_map.h>
 #include <executorch/runtime/platform/runtime.h>
 
 /**
@@ -38,6 +39,7 @@ namespace executorch {
 namespace extension {
 namespace ET_MODULE_NAMESPACE {
 
+using ET_MERGED_DATA_MAP_NAMESPACE::MergedDataMap;
 using ET_RUNTIME_NAMESPACE::MethodMeta;
 using ET_RUNTIME_NAMESPACE::Program;
 
@@ -155,10 +157,6 @@ runtime::Error Module::load(const Program::Verification verification) {
       data_loader_ = ET_UNWRAP(make_data_loader(file_path_, load_mode_));
     }
     if (data_files_.size() > 0) {
-      ET_CHECK_OR_RETURN_ERROR(
-          data_files_.size() == 1,
-          NotImplemented,
-          "Multiple named data map paths are not supported yet.");
       for (const auto& data_file : data_files_) {
         data_map_loaders_.push_back(
             ET_UNWRAP(make_data_loader(data_file, load_mode_)));
@@ -166,13 +164,20 @@ runtime::Error Module::load(const Program::Verification verification) {
     }
 
     if (data_map_loaders_.size() > 0) {
-      ET_CHECK_OR_RETURN_ERROR(
-          data_map_loaders_.size() == 1 && merged_data_map_ == nullptr,
-          NotImplemented,
-          "Multiple named data map loaders are not supported yet.");
-      // TODO(lfq): support multiple named data map loaders.
-      merged_data_map_ =
-          ET_UNWRAP_UNIQUE(FlatTensorDataMap::load(data_map_loaders_[0].get()));
+      for (auto i = 0; i < data_map_loaders_.size(); ++i) {
+        named_data_maps_.push_back(ET_UNWRAP_UNIQUE(
+            FlatTensorDataMap::load(data_map_loaders_[i].get())));
+      }
+
+      // Extract raw pointers from unique_ptrs to pass to MergedDataMap::load()
+      std::vector<const NamedDataMap*> raw_data_maps;
+      raw_data_maps.reserve(named_data_maps_.size());
+      for (const auto& data_map : named_data_maps_) {
+        raw_data_maps.push_back(data_map.get());
+      }
+      merged_data_map_ = ET_UNWRAP_UNIQUE(
+          MergedDataMap::load(runtime::Span<const NamedDataMap*>(
+              raw_data_maps.data(), raw_data_maps.size())));
     }
 
     auto program =
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl
index 3e449da5e14..0db909ce053 100644
--- a/extension/module/targets.bzl
+++ b/extension/module/targets.bzl
@@ -26,6 +26,7 @@ def define_common_targets():
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/extension/data_loader:mmap_data_loader",
                 "//executorch/extension/flat_tensor:flat_tensor_data_map" + aten_suffix,
+                "//executorch/extension/named_data_map:merged_data_map" + aten_suffix,
             ],
             exported_deps = [
                 "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt
index 1c4358dd73e..54ace17557f 100644
--- a/extension/module/test/CMakeLists.txt
+++ b/extension/module/test/CMakeLists.txt
@@ -23,11 +23,14 @@ add_custom_command(
   OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte"
          "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+         "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte"
+         "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
   COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
           "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}"
   COMMAND
-    ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul"
-    --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}"
+    ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
+    "ModuleAddMul,ModuleLinear" --external-constants --outdir
+    "${CMAKE_CURRENT_BINARY_DIR}"
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
 )
 
@@ -36,12 +39,16 @@ add_custom_target(
   DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte"
           "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte"
           "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+          "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte"
+          "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
 )
 
 set(test_env
     "ET_MODULE_ADD_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte"
     "ET_MODULE_ADD_MUL_PROGRAM_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte"
     "ET_MODULE_ADD_MUL_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+    "ET_MODULE_LINEAR_PROGRAM_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte"
+    "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
 )
 
 et_cxx_test(
diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp
index 6f7e8a44558..27332503cad 100644
--- a/extension/module/test/module_test.cpp
+++ b/extension/module/test/module_test.cpp
@@ -26,11 +26,15 @@ class ModuleTest : public ::testing::Test {
     model_path_ = std::getenv("ET_MODULE_ADD_PATH");
     add_mul_path_ = std::getenv("ET_MODULE_ADD_MUL_PROGRAM_PATH");
     add_mul_data_path_ = std::getenv("ET_MODULE_ADD_MUL_DATA_PATH");
+    linear_path_ = std::getenv("ET_MODULE_LINEAR_PROGRAM_PATH");
+    linear_data_path_ = std::getenv("ET_MODULE_LINEAR_DATA_PATH");
   }
 
   static inline std::string model_path_;
   static inline std::string add_mul_path_;
   static inline std::string add_mul_data_path_;
+  static inline std::string linear_path_;
+  static inline std::string linear_data_path_;
 };
 
 TEST_F(ModuleTest, TestLoad) {
@@ -532,16 +536,21 @@ TEST_F(ModuleTest, TestPTD) {
 }
 
 TEST_F(ModuleTest, TestPTD_Multiple) {
-  std::vector<std::string> data_files = {add_mul_data_path_};
-  Module module(add_mul_path_, data_files);
-
-  ASSERT_EQ(module.load_method("forward"), Error::Ok);
+  std::vector<std::string> data_files = {add_mul_data_path_, linear_data_path_};
 
+  // Create module with add mul.
+  Module module_add_mul(add_mul_path_, data_files);
+  ASSERT_EQ(module_add_mul.load_method("forward"), Error::Ok);
   auto tensor = make_tensor_ptr({2, 2}, {2.f, 3.f, 4.f, 2.f});
-  ASSERT_EQ(module.forward(tensor).error(), Error::Ok);
+  ASSERT_EQ(module_add_mul.forward(tensor).error(), Error::Ok);
 
   // Confirm that the data_file is not std::move'd away.
   ASSERT_EQ(std::strcmp(data_files[0].c_str(), add_mul_data_path_.c_str()), 0);
+  ASSERT_EQ(std::strcmp(data_files[1].c_str(), linear_data_path_.c_str()), 0);
 
-  // TODO(lfq): add test when merge capability is supported.
+  // Create module with linear.
+  Module module_linear(linear_path_, data_files);
+  ASSERT_EQ(module_linear.load_method("forward"), Error::Ok);
+  auto tensor2 = make_tensor_ptr({3}, {2.f, 3.f, 4.f});
+  ASSERT_EQ(module_linear.forward(tensor2).error(), Error::Ok);
 }
diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl
index d1aa73f6789..da7f1cc91bd 100644
--- a/extension/module/test/targets.bzl
+++ b/extension/module/test/targets.bzl
@@ -19,6 +19,8 @@ def define_common_targets(is_fbcode=False):
             "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
             "ET_MODULE_ADD_MUL_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleAddMul.pte])",
             "ET_MODULE_ADD_MUL_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleAddMul.ptd])",
+            "ET_MODULE_LINEAR_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.pte])",
+            "ET_MODULE_LINEAR_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.ptd])",
             "ET_MODULE_SHARED_STATE": "$(location fbcode//executorch/test/models:exported_programs[ModuleSharedState.pte])",
         }
 
diff --git a/extension/named_data_map/CMakeLists.txt b/extension/named_data_map/CMakeLists.txt
new file mode 100644
index 00000000000..a4ad208c7e2
--- /dev/null
+++ b/extension/named_data_map/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please format this file by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+list(TRANSFORM _extension_named_data_map__srcs PREPEND "${EXECUTORCH_ROOT}/")
+# Create the library
+add_library(extension_named_data_map ${_extension_named_data_map__srcs})
+
+# Link dependencies
+target_link_libraries(extension_named_data_map PUBLIC executorch_core)
+
+target_include_directories(
+  extension_named_data_map PUBLIC ${_common_include_directories}
+)
+
+target_compile_options(
+  extension_named_data_map PUBLIC ${_common_compile_options}
+)
+
+# Install libraries
+install(
+  TARGETS extension_named_data_map
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  INCLUDES
+  DESTINATION ${_common_include_directories}
+)
+
+# Add tests if testing is enabled
+if(BUILD_TESTING)
+  add_subdirectory(test)
+endif()
diff --git a/extension/named_data_map/TARGETS b/extension/named_data_map/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/extension/named_data_map/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/named_data_map/merged_data_map.cpp b/extension/named_data_map/merged_data_map.cpp
new file mode 100644
index 00000000000..2d1bb7d6158
--- /dev/null
+++ b/extension/named_data_map/merged_data_map.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/core/data_loader.h>
+
+#include <unordered_map>
+#include <vector>
+
+using executorch::aten::string_view;
+using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
+using executorch::ET_RUNTIME_NAMESPACE::TensorLayout;
+using executorch::runtime::Error;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+
+namespace executorch::extension {
+namespace ET_MERGED_DATA_MAP_NAMESPACE {
+/*static*/ Result<MergedDataMap> MergedDataMap::load(
+    Span<const NamedDataMap*> named_data_maps) {
+  std::vector<const NamedDataMap*> valid_data_maps;
+  for (auto i : c10::irange(named_data_maps.size())) {
+    if (named_data_maps[i] != nullptr &&
+        named_data_maps[i]->get_num_keys().get() > 0) {
+      valid_data_maps.push_back(named_data_maps[i]);
+    }
+  }
+  ET_CHECK_OR_RETURN_ERROR(
+      !valid_data_maps.empty(),
+      InvalidArgument,
+      "No non-empty named data maps provided to merge");
+
+  // Check for duplicate keys.
+  std::unordered_map<std::string, uint32_t> key_to_map_index;
+  for (const uint32_t i : c10::irange(valid_data_maps.size())) {
+    const auto cur_map = valid_data_maps[i];
+    uint32_t num_keys = cur_map->get_num_keys().get();
+    for (auto j : c10::irange(num_keys)) {
+      const auto cur_key = cur_map->get_key(j).get();
+      const auto [it, inserted] = key_to_map_index.emplace(cur_key, i);
+      ET_CHECK_OR_RETURN_ERROR(
+          inserted,
+          InvalidArgument,
+          "Duplicate key %s in named data maps at index %u and %" PRIu32,
+          cur_key,
+          it->second,
+          i);
+    }
+  }
+  return MergedDataMap(std::move(valid_data_maps), std::move(key_to_map_index));
+}
+
+ET_NODISCARD Result<const TensorLayout> MergedDataMap::get_tensor_layout(
+    string_view key) const {
+  const auto it = key_to_map_index_.find(key.data());
+  ET_CHECK_OR_RETURN_ERROR(
+      it != key_to_map_index_.end(),
+      NotFound,
+      "Key %s not found in named data maps",
+      key.data());
+
+  return named_data_maps_.at(it->second)->get_tensor_layout(key);
+}
+
+ET_NODISCARD
+Result<FreeableBuffer> MergedDataMap::get_data(string_view key) const {
+  const auto it = key_to_map_index_.find(key.data());
+  ET_CHECK_OR_RETURN_ERROR(
+      it != key_to_map_index_.end(),
+      NotFound,
+      "Key %s not found in named data maps",
+      key.data());
+  return named_data_maps_.at(it->second)->get_data(key);
+}
+
+ET_NODISCARD Error MergedDataMap::load_data_into(
+    string_view key,
+    void* buffer,
+    size_t size) const {
+  const auto it = key_to_map_index_.find(key.data());
+  ET_CHECK_OR_RETURN_ERROR(
+      it != key_to_map_index_.end(),
+      NotFound,
+      "Key %s not found in named data maps",
+      key.data());
+  return named_data_maps_.at(it->second)->load_data_into(key, buffer, size);
+}
+
+ET_NODISCARD Result<uint32_t> MergedDataMap::get_num_keys() const {
+  return key_to_map_index_.size();
+}
+
+ET_NODISCARD Result<const char*> MergedDataMap::get_key(uint32_t index) const {
+  uint32_t total_num_keys = get_num_keys().get();
+  ET_CHECK_OR_RETURN_ERROR(
+      index < total_num_keys,
+      InvalidArgument,
+      "Index %u out of range of size %u",
+      index,
+      total_num_keys);
+  for (auto i : c10::irange(named_data_maps_.size())) {
+    auto num_keys = named_data_maps_[i]->get_num_keys().get();
+    if (index < num_keys) {
+      return named_data_maps_[i]->get_key(index);
+    }
+    index -= num_keys;
+  }
+  // Shouldn't reach here.
+  return Error::Internal;
+}
+
+} // namespace ET_MERGED_DATA_MAP_NAMESPACE
+} // namespace executorch::extension
diff --git a/extension/named_data_map/merged_data_map.h b/extension/named_data_map/merged_data_map.h
new file mode 100644
index 00000000000..42490ec3d58
--- /dev/null
+++ b/extension/named_data_map/merged_data_map.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/named_data_map.h>
+
+#include <unordered_map>
+#include <vector>
+
+#ifdef USE_ATEN_LIB
+#define ET_MERGED_DATA_MAP_NAMESPACE merged_data_map::aten
+#else // !USE_ATEN_LIB
+#define ET_MERGED_DATA_MAP_NAMESPACE merged_data_map
+#endif // USE_ATEN_LIB
+
+namespace executorch::extension {
+
+namespace ET_MERGED_DATA_MAP_NAMESPACE {
+/**
+ * A NamedDataMap implementation that wraps other NamedDataMaps.
+ */
+class MergedDataMap final
+    : public executorch::ET_RUNTIME_NAMESPACE::NamedDataMap {
+ public:
+  /**
+   * Creates a new NamedDataMap that takes in other data maps.
+   *
+   * @param[in] data_maps vector of NamedDataMap pointers to merge.
+   * Note: the data maps must outlive the MergedDataMap instance.
+   */
+  static executorch::runtime::Result<MergedDataMap>
+  load(executorch::runtime::Span<
+       const executorch::ET_RUNTIME_NAMESPACE::NamedDataMap*> named_data_maps);
+
+  /**
+   * Retrieve the tensor_layout for the specified key.
+   *
+   * @param[in] key The name of the tensor to get metadata on.
+   *
+   * @return Error::NotFound if the key is not present.
+   */
+  ET_NODISCARD
+  executorch::runtime::Result<
+      const executorch::ET_RUNTIME_NAMESPACE::TensorLayout>
+  get_tensor_layout(executorch::aten::string_view key) const override;
+
+  /**
+   * Retrieve read-only data for the specified key.
+   *
+   * @param[in] key The name of the tensor to get data on.
+   *
+   * @return error if the key is not present or data cannot be loaded.
+   */
+  ET_NODISCARD
+  executorch::runtime::Result<executorch::runtime::FreeableBuffer> get_data(
+      executorch::aten::string_view key) const override;
+
+  /**
+   * Loads the data of the specified tensor into the provided buffer.
+   *
+   * @param[in] key The name of the tensor to get the data of.
+   * @param[in] buffer The buffer to load data into. Must point to at least
+   * `size` bytes of memory.
+   * @param[in] size The number of bytes to load.
+   *
+   * @returns an Error indicating if the load was successful.
+   */
+  ET_NODISCARD executorch::runtime::Error load_data_into(
+      executorch::aten::string_view key,
+      void* buffer,
+      size_t size) const override;
+
+  /**
+   * @returns The number of keys in the map.
+   */
+  ET_NODISCARD executorch::runtime::Result<uint32_t> get_num_keys()
+      const override;
+  /**
+   * @returns The key at the specified index, error if index out of bounds.
+   */
+  ET_NODISCARD executorch::runtime::Result<const char*> get_key(
+      uint32_t index) const override;
+
+  MergedDataMap(MergedDataMap&&) noexcept = default;
+
+  ~MergedDataMap() override = default;
+
+ private:
+  MergedDataMap(
+      std::vector<const executorch::ET_RUNTIME_NAMESPACE::NamedDataMap*>
+          named_data_maps,
+      std::unordered_map<std::string, uint32_t> key_to_map_index)
+      : named_data_maps_(std::move(named_data_maps)),
+        key_to_map_index_(std::move(key_to_map_index)) {}
+
+  // Not copyable or assignable.
+  MergedDataMap(const MergedDataMap& rhs) = delete;
+  MergedDataMap& operator=(MergedDataMap&& rhs) noexcept = delete;
+  MergedDataMap& operator=(const MergedDataMap& rhs) = delete;
+
+  std::vector<const executorch::ET_RUNTIME_NAMESPACE::NamedDataMap*>
+      named_data_maps_;
+
+  // Map from key to index in the named_data_maps_ vector.
+  std::unordered_map<std::string, uint32_t> key_to_map_index_;
+};
+
+} // namespace ET_MERGED_DATA_MAP_NAMESPACE
+} // namespace executorch::extension
diff --git a/extension/named_data_map/targets.bzl b/extension/named_data_map/targets.bzl
new file mode 100644
index 00000000000..0c2b2fa6d5c
--- /dev/null
+++ b/extension/named_data_map/targets.bzl
@@ -0,0 +1,21 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
+
+def define_common_targets():
+    for aten_mode in get_aten_mode_options():
+        aten_suffix = "_aten" if aten_mode else ""
+        runtime.cxx_library(
+            name = "merged_data_map" + aten_suffix,
+            srcs = [
+                "merged_data_map.cpp",
+            ],
+            exported_headers = [
+                "merged_data_map.h",
+            ],
+            visibility = [
+                "@EXECUTORCH_CLIENTS",
+            ],
+            deps = [
+                "//executorch/runtime/core:named_data_map" + aten_suffix,
+                "//executorch/runtime/core:core",
+            ],
+        )
diff --git a/extension/named_data_map/test/CMakeLists.txt b/extension/named_data_map/test/CMakeLists.txt
new file mode 100644
index 00000000000..7fbcb7e5989
--- /dev/null
+++ b/extension/named_data_map/test/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Please this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
+
+add_custom_command(
+  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+         "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
+  COMMAND
+    ${PYTHON_EXECUTABLE} -m test.models.export_program --modules
+    "ModuleAddMul,ModuleLinear" --external-constants --outdir
+    "${CMAKE_CURRENT_BINARY_DIR}"
+  WORKING_DIRECTORY ${EXECUTORCH_ROOT}
+)
+
+add_custom_target(
+  extension_named_data_map_test_resources
+  DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+          "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
+)
+
+set(test_env
+    "ET_MODULE_ADD_MUL_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd"
+    "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd"
+)
+
+set(_test_srcs merged_data_map_test.cpp)
+
+et_cxx_test(
+  extension_named_data_map_test
+  SOURCES
+  ${_test_srcs}
+  EXTRA_LIBS
+  extension_named_data_map
+  extension_flat_tensor
+  extension_data_loader
+)
+
+add_dependencies(
+  extension_named_data_map_test extension_named_data_map
+  extension_named_data_map_test_resources
+)
+set_property(
+  TEST extension_named_data_map_test PROPERTY ENVIRONMENT ${test_env}
+)
diff --git a/extension/named_data_map/test/TARGETS b/extension/named_data_map/test/TARGETS
new file mode 100644
index 00000000000..883ab644309
--- /dev/null
+++ b/extension/named_data_map/test/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets(is_fbcode=True)
diff --git a/extension/named_data_map/test/merged_data_map_test.cpp b/extension/named_data_map/test/merged_data_map_test.cpp
new file mode 100644
index 00000000000..ccfaaa0ec0e
--- /dev/null
+++ b/extension/named_data_map/test/merged_data_map_test.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
+#include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+using namespace ::testing;
+using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
+using executorch::extension::merged_data_map::MergedDataMap;
+using executorch::runtime::DataLoader;
+using executorch::runtime::Error;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::TensorLayout;
+
+class MergedDataMapTest : public ::testing::Test {
+ protected:
+  void load_flat_tensor_data_map(const char* path, const char* module_name) {
+    Result<FileDataLoader> loader = FileDataLoader::from(path);
+    ASSERT_EQ(loader.error(), Error::Ok);
+    loaders_.emplace(
+        module_name, std::make_unique<FileDataLoader>(std::move(loader.get())));
+
+    Result<FlatTensorDataMap> data_map =
+        FlatTensorDataMap::load(loaders_[module_name].get());
+    EXPECT_EQ(data_map.error(), Error::Ok);
+
+    data_maps_.emplace(
+        module_name,
+        std::make_unique<FlatTensorDataMap>(std::move(data_map.get())));
+  }
+
+  void SetUp() override {
+    // Since these tests cause ET_LOG to be called, the PAL must be initialized
+    // first.
+    executorch::runtime::runtime_init();
+
+    // Load FlatTensor data maps.
+    // The eager addmul and linear models are defined at:
+    // //executorch/test/models/export_program.py
+    load_flat_tensor_data_map(
+        std::getenv("ET_MODULE_ADD_MUL_DATA_PATH"), "addmul");
+    load_flat_tensor_data_map(
+        std::getenv("ET_MODULE_LINEAR_DATA_PATH"), "linear");
+  }
+
+ private:
+  // Must outlive data_maps_, but tests shouldn't need to touch it.
+  std::unordered_map<std::string, std::unique_ptr<FileDataLoader>> loaders_;
+
+ protected:
+  std::unordered_map<std::string, std::unique_ptr<NamedDataMap>> data_maps_;
+};
+
+// Check that two tensor layouts are equivalent.
+void check_tensor_layout(TensorLayout& layout1, TensorLayout& layout2) {
+  EXPECT_EQ(layout1.scalar_type(), layout2.scalar_type());
+  EXPECT_EQ(layout1.nbytes(), layout2.nbytes());
+  EXPECT_EQ(layout1.sizes().size(), layout2.sizes().size());
+  for (auto i : c10::irange(layout1.sizes().size())) {
+    EXPECT_EQ(layout1.sizes()[i], layout2.sizes()[i]);
+  }
+  EXPECT_EQ(layout1.dim_order().size(), layout2.dim_order().size());
+  for (auto i : c10::irange(layout1.dim_order().size())) {
+    EXPECT_EQ(layout1.dim_order()[i], layout2.dim_order()[i]);
+  }
+}
+
+// Given that ndm is part of merged, check that all the API calls on ndm produce
+// the same results as merged.
+void compare_ndm_api_calls(
+    const NamedDataMap* ndm,
+    const NamedDataMap* merged) {
+  uint32_t num_keys = ndm->get_num_keys().get();
+  for (auto i : c10::irange(num_keys)) {
+    auto key = ndm->get_key(i).get();
+
+    // Compare get_tensor_layout.
+    auto ndm_meta = ndm->get_tensor_layout(key).get();
+    auto merged_meta = merged->get_tensor_layout(key).get();
+    check_tensor_layout(ndm_meta, merged_meta);
+
+    // Compare get_data.
+    auto ndm_data = ndm->get_data(key);
+    auto merged_data = merged->get_data(key);
+    EXPECT_EQ(ndm_data.get().size(), merged_data.get().size());
+    for (auto j : c10::irange(ndm_meta.nbytes())) {
+      EXPECT_EQ(
+          ((uint8_t*)ndm_data.get().data())[j],
+          ((uint8_t*)merged_data.get().data())[j]);
+    }
+    ndm_data->Free();
+    merged_data->Free();
+
+    // Compare load_into.
+    auto nbytes = ndm_meta.nbytes();
+    auto ndm_buffer = std::make_unique<uint8_t[]>(nbytes);
+    auto ndm_load_into = ndm->load_data_into(key, ndm_buffer.get(), nbytes);
+    EXPECT_EQ(ndm_load_into, Error::Ok);
+    auto merged_buffer = std::make_unique<uint8_t[]>(nbytes);
+    auto merged_load_into =
+        merged->load_data_into(key, merged_buffer.get(), nbytes);
+    EXPECT_EQ(merged_load_into, Error::Ok);
+    for (auto j : c10::irange(ndm_meta.nbytes())) {
+      EXPECT_EQ(
+          ((uint8_t*)merged_buffer.get())[j],
+          ((uint8_t*)merged_buffer.get())[j]);
+    }
+  }
+}
+
+TEST_F(MergedDataMapTest, LoadNullDataMap) {
+  Result<MergedDataMap> merged_map = MergedDataMap::load({nullptr, nullptr});
+  EXPECT_EQ(merged_map.error(), Error::InvalidArgument);
+}
+
+TEST_F(MergedDataMapTest, LoadSingleDataMap) {
+  std::vector<const NamedDataMap*> ndms = {data_maps_["addmul"].get(), nullptr};
+  Result<MergedDataMap> merged_map =
+      MergedDataMap::load(Span<const NamedDataMap*>(ndms.data(), ndms.size()));
+  EXPECT_EQ(merged_map.error(), Error::Ok);
+
+  // Num keys.
+  EXPECT_EQ(
+      merged_map->get_num_keys().get(),
+      data_maps_["addmul"]->get_num_keys().get());
+
+  // API calls produce equivalent results.
+  compare_ndm_api_calls(data_maps_["addmul"].get(), &merged_map.get());
+}
+
+TEST_F(MergedDataMapTest, LoadDuplicateDataMapsFail) {
+  std::vector<const NamedDataMap*> ndms = {
+      data_maps_["addmul"].get(), data_maps_["addmul"].get()};
+  Result<MergedDataMap> merged_map =
+      MergedDataMap::load(Span<const NamedDataMap*>(ndms.data(), ndms.size()));
+  EXPECT_EQ(merged_map.error(), Error::InvalidArgument);
+}
+
+TEST_F(MergedDataMapTest, CheckDataMapContents) {
+  std::vector<const NamedDataMap*> ndms = {
+      data_maps_["addmul"].get(), data_maps_["linear"].get()};
+  Result<MergedDataMap> merged_map =
+      MergedDataMap::load(Span<const NamedDataMap*>(ndms.data(), ndms.size()));
+  EXPECT_EQ(merged_map.error(), Error::Ok);
+
+  // Num keys.
+  size_t addmul_num_keys = data_maps_["addmul"]->get_num_keys().get();
+  size_t linear_num_keys = data_maps_["linear"]->get_num_keys().get();
+  EXPECT_EQ(
+      merged_map->get_num_keys().get(), addmul_num_keys + linear_num_keys);
+
+  // API calls produce equivalent results.
+  compare_ndm_api_calls(data_maps_["addmul"].get(), &merged_map.get());
+  compare_ndm_api_calls(data_maps_["linear"].get(), &merged_map.get());
+}
diff --git a/extension/named_data_map/test/targets.bzl b/extension/named_data_map/test/targets.bzl
new file mode 100644
index 00000000000..516abb8d45e
--- /dev/null
+++ b/extension/named_data_map/test/targets.bzl
@@ -0,0 +1,26 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets(is_fbcode=False):
+    if not runtime.is_oss and is_fbcode:
+        modules_env = {
+            # The tests use this var to find the program file to load. This uses
+            # an fbcode target path because the authoring/export tools
+            # intentionally don't work in xplat (since they're host-only tools).
+            "ET_MODULE_ADD_MUL_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleAddMul.ptd])",
+            "ET_MODULE_LINEAR_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.ptd])",
+        }
+
+        runtime.cxx_test(
+            name = "merged_data_map_test",
+            srcs = [
+                "merged_data_map_test.cpp",
+            ],
+            deps = [
+                "//executorch/extension/data_loader:file_data_loader",
+                "//executorch/extension/flat_tensor:flat_tensor_data_map",
+                "//executorch/extension/named_data_map:merged_data_map",
+                "//executorch/runtime/core:named_data_map",
+                "//executorch/runtime/core/exec_aten:lib",
+            ],
+            env = modules_env,
+        )
diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
index a896a4bde36..c3cd4ed0b47 100644
--- a/extension/pybindings/pybindings.cpp
+++ b/extension/pybindings/pybindings.cpp
@@ -158,6 +158,24 @@ void setup_output_storage(
   }
 }
 
+inline std::unique_ptr<DataLoader> loader_from_buffer(
+    const void* ptr,
+    size_t ptr_len) {
+  return std::make_unique<BufferDataLoader>(ptr, ptr_len);
+}
+
+inline std::unique_ptr<DataLoader> loader_from_file(const std::string& path) {
+  Result<MmapDataLoader> res = MmapDataLoader::from(
+      path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
+  THROW_IF_ERROR(
+      res.error(),
+      "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
+      path.c_str(),
+      static_cast<uint32_t>(res.error()));
+
+  return std::make_unique<MmapDataLoader>(std::move(res.get()));
+}
+
 inline std::unique_ptr<Module> load_module_from_buffer(
     const void* ptr,
     size_t ptr_len,
@@ -166,11 +184,11 @@ inline std::unique_ptr<Module> load_module_from_buffer(
     std::unique_ptr<runtime::EventTracer> event_tracer,
     Program::Verification program_verification) {
   EXECUTORCH_SCOPE_PROF("load_module_from_buffer");
-  auto loader = std::make_unique<BufferDataLoader>(ptr, ptr_len);
+  auto loader = loader_from_buffer(ptr, ptr_len);
 
   if (data_map_ptr.has_value() && data_map_len.has_value()) {
-    auto data_map_loader = std::make_unique<BufferDataLoader>(
-        data_map_ptr.value(), data_map_len.value());
+    auto data_map_loader =
+        loader_from_buffer(data_map_ptr.value(), data_map_len.value());
     return std::make_unique<Module>(
         std::move(loader),
         nullptr, // memory_allocator
@@ -194,27 +212,9 @@ inline std::unique_ptr<Module> load_module_from_file(
     Program::Verification program_verification) {
   EXECUTORCH_SCOPE_PROF("load_module_from_file");
 
-  Result<MmapDataLoader> program_loader_res = MmapDataLoader::from(
-      program_path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
-  THROW_IF_ERROR(
-      program_loader_res.error(),
-      "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
-      program_path.c_str(),
-      static_cast<uint32_t>(program_loader_res.error()));
-  auto program_loader =
-      std::make_unique<MmapDataLoader>(std::move(program_loader_res.get()));
-
+  auto program_loader = loader_from_file(program_path);
   if (data_map_path.has_value()) {
-    Result<MmapDataLoader> data_map_loader_res = MmapDataLoader::from(
-        data_map_path->c_str(),
-        MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
-    THROW_IF_ERROR(
-        data_map_loader_res.error(),
-        "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
-        data_map_path->c_str(),
-        static_cast<uint32_t>(data_map_loader_res.error()));
-    auto data_map_loader =
-        std::make_unique<MmapDataLoader>(std::move(data_map_loader_res.get()));
+    auto data_map_loader = loader_from_file(data_map_path.value());
     return std::make_unique<Module>(
         std::move(program_loader),
         nullptr, // memory_allocator
@@ -230,6 +230,22 @@ inline std::unique_ptr<Module> load_module_from_file(
       nullptr); // data_map_loader
 }
 
+inline std::unique_ptr<Module> load_module_from_buffer_with_data_file(
+    const void* ptr,
+    size_t ptr_len,
+    const std::string& data_map_path,
+    std::unique_ptr<runtime::EventTracer> event_tracer,
+    Program::Verification program_verification) {
+  auto program_loader = loader_from_buffer(ptr, ptr_len);
+  auto data_loader = loader_from_file(data_map_path);
+  return std::make_unique<Module>(
+      std::move(program_loader),
+      nullptr, // memory_allocator
+      nullptr, // temp_allocator
+      std::move(event_tracer), // event_tracer
+      std::move(data_loader));
+}
+
 inline py::list get_outputs_as_py_list(
     const std::vector<EValue>& outputs,
     bool clone_outputs = true) {
@@ -555,6 +571,22 @@ struct PyModule final {
             setup_event_tracer(enable_etdump, debug_buffer_size),
             program_verification)) {}
 
+  explicit PyModule(
+      const void* ptr,
+      size_t ptr_len,
+      const std::string& data_path,
+      bool enable_etdump,
+      size_t debug_buffer_size = 0,
+      Program::Verification program_verification =
+          Program::Verification::InternalConsistency)
+      : debug_buffer_size_(debug_buffer_size),
+        module_(load_module_from_buffer_with_data_file(
+            ptr,
+            ptr_len,
+            data_path,
+            setup_event_tracer(enable_etdump, debug_buffer_size),
+            program_verification)) {}
+
   explicit PyModule(
       const std::string& program_path,
       std::optional<const std::string>& data_path,
@@ -605,6 +637,7 @@ struct PyModule final {
         program_verification);
   }
 
+  // Load with data as a buffer.
   static std::unique_ptr<PyModule> load_from_bundled_program(
       PyBundledModule& m,
       std::optional<const py::bytes> data_map_buffer,
@@ -628,6 +661,21 @@ struct PyModule final {
         Program::Verification::InternalConsistency);
   }
 
+  // Load with data as a file.
+  static std::unique_ptr<PyModule> load_from_bundled_program(
+      PyBundledModule& m,
+      const std::string& data_path,
+      bool enable_etdump,
+      size_t debug_buffer_size = 0) {
+    return std::make_unique<PyModule>(
+        m.get_program_ptr(),
+        m.get_program_len(),
+        data_path,
+        enable_etdump,
+        debug_buffer_size,
+        Program::Verification::InternalConsistency);
+  }
+
   py::list run_method(
       const std::string& method_name,
       const py::sequence& inputs,
@@ -900,24 +948,6 @@ struct PyModule final {
   }
 };
 
-inline std::unique_ptr<DataLoader> loader_from_buffer(
-    const void* ptr,
-    size_t ptr_len) {
-  return std::make_unique<BufferDataLoader>(ptr, ptr_len);
-}
-
-inline std::unique_ptr<DataLoader> loader_from_file(const std::string& path) {
-  Result<MmapDataLoader> res = MmapDataLoader::from(
-      path.c_str(), MmapDataLoader::MlockConfig::UseMlockIgnoreErrors);
-  THROW_IF_ERROR(
-      res.error(),
-      "Failed to create MmapDataLoader from file %s, error: 0x:%" PRIx32,
-      path.c_str(),
-      static_cast<uint32_t>(res.error()));
-
-  return std::make_unique<MmapDataLoader>(std::move(res.get()));
-}
-
 inline std::shared_ptr<ProgramState> load_program(
     std::unique_ptr<DataLoader> loader,
     Program::Verification program_verification) {
@@ -1474,12 +1504,25 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
       call_guard);
   m.def(
       "_load_for_executorch_from_bundled_program",
-      &PyModule::load_from_bundled_program,
+      py::overload_cast<
+          PyBundledModule&,
+          std::optional<const py::bytes>,
+          bool,
+          size_t>(&PyModule::load_from_bundled_program),
       py::arg("ptr"),
       py::arg("data_map_buffer") = std::nullopt,
       py::arg("enable_etdump") = false,
       py::arg("debug_buffer_size") = 0,
       call_guard);
+  m.def(
+      "_load_for_executorch_from_bundled_program",
+      py::overload_cast<PyBundledModule&, const std::string&, bool, size_t>(
+          &PyModule::load_from_bundled_program),
+      py::arg("ptr"),
+      py::arg("data_path"),
+      py::arg("enable_etdump") = false,
+      py::arg("debug_buffer_size") = 0,
+      call_guard);
   m.def(
       "_load_bundled_program_from_buffer",
       &PyBundledModule::load_from_buffer,
diff --git a/extension/pybindings/test/test_pybindings.py b/extension/pybindings/test/test_pybindings.py
index 02ad6b5e327..ec45428c7d7 100644
--- a/extension/pybindings/test/test_pybindings.py
+++ b/extension/pybindings/test/test_pybindings.py
@@ -701,7 +701,7 @@ def test_program_data_separation(self) -> None:
         bundled_buffer = serialize_from_bundled_program_to_flatbuffer(bundled_program)
         bundled_module = self.runtime._load_bundled_program_from_buffer(bundled_buffer)
 
-        # Load module from bundled program with external data
+        # Load module from bundled program with external data buffer
         executorch_module_bundled = (
             self.runtime._load_for_executorch_from_bundled_program(
                 bundled_module, data_buffer
@@ -710,6 +710,23 @@ def test_program_data_separation(self) -> None:
         executorch_output_bundled = executorch_module_bundled.forward(inputs)[0]
         self.assertTrue(torch.allclose(expected, executorch_output_bundled))
 
+        # Load module from bundled program with external data file
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ptd_file = os.path.join(tmpdir, "linear.ptd")
+            with open(ptd_file, "wb") as ptd:
+                ptd.write(data_buffer)
+            executorch_module_bundled_data_file = (
+                self.runtime._load_for_executorch_from_bundled_program(
+                    bundled_module, ptd_file
+                )
+            )
+            executorch_output_bundled_data_file = (
+                executorch_module_bundled_data_file.forward(inputs)[0]
+            )
+            self.assertTrue(
+                torch.allclose(expected, executorch_output_bundled_data_file)
+            )
+
         # Test 6: Bundled program without external data should fail
         executorch_module_bundled_no_data = (
             self.runtime._load_for_executorch_from_bundled_program(bundled_module)
diff --git a/extension/runner_util/CMakeLists.txt b/extension/runner_util/CMakeLists.txt
index 0bf8f33a656..75fa11c0493 100644
--- a/extension/runner_util/CMakeLists.txt
+++ b/extension/runner_util/CMakeLists.txt
@@ -29,7 +29,7 @@ target_compile_options(extension_runner_util PUBLIC ${_common_compile_options})
 install(
   TARGETS extension_runner_util
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/tensor/CMakeLists.txt b/extension/tensor/CMakeLists.txt
index 0e409c3bfb3..2a8d9b17916 100644
--- a/extension/tensor/CMakeLists.txt
+++ b/extension/tensor/CMakeLists.txt
@@ -28,7 +28,7 @@ target_compile_options(extension_tensor PUBLIC ${_common_compile_options})
 install(
   TARGETS extension_tensor
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 4753ec296da..d8fad857cd2 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -123,13 +123,14 @@ inline TensorPtr make_tensor_ptr(
       }
     } ctx;
 
-    ET_SWITCH_REALHBBF16_TYPES(type, ctx, "make_tensor_ptr", CTYPE, [&] {
-      std::transform(
-          data.begin(),
-          data.end(),
-          reinterpret_cast<CTYPE*>(casted_data.data()),
-          [](const T& val) { return static_cast<CTYPE>(val); });
-    });
+    ET_SWITCH_REALHBBF16_AND_UINT_TYPES(
+        type, ctx, "make_tensor_ptr", CTYPE, [&] {
+          std::transform(
+              data.begin(),
+              data.end(),
+              reinterpret_cast<CTYPE*>(casted_data.data()),
+              [](const T& val) { return static_cast<CTYPE>(val); });
+        });
     const auto raw_data_ptr = casted_data.data();
     auto data_ptr =
         std::make_shared<std::vector<uint8_t>>(std::move(casted_data));
@@ -272,7 +273,8 @@ inline TensorPtr make_tensor_ptr(
  */
 template <typename T>
 inline TensorPtr make_tensor_ptr(T value) {
-  return make_tensor_ptr({}, std::vector<T>{value});
+  return make_tensor_ptr(
+      std::vector<executorch::aten::SizesType>{}, std::vector<T>{value});
 }
 
 /**
@@ -323,32 +325,87 @@ inline TensorPtr make_tensor_ptr(
 }
 
 /**
- * Creates a TensorPtr to manage a new Tensor with the same properties
- * as the given Tensor, sharing the same data without owning it.
+ * Creates a TensorPtr to manage a new Tensor that aliases the given Tensor's
+ * storage, with optional metadata overrides. Shape dynamism is inherited from
+ * the source tensor.
+ *
+ * If an override is provided (non-empty), it is passed as-is. If an override is
+ * empty, the corresponding metadata is reused from the source tensor when it
+ * fits; otherwise it is left empty for the core factory to derive a valid
+ * configuration. If `dim_order` is empty but `strides` is provided, `dim_order`
+ * is left empty so the core may infer it from the provided strides.
  *
- * @param tensor The Tensor whose properties are used to create a new TensorPtr.
- * @return A new TensorPtr managing a Tensor with the same properties as the
- * original.
+ * @param tensor The source tensor to alias.
+ * @param sizes Optional sizes override.
+ * @param dim_order Optional dimension order override.
+ * @param strides Optional strides override.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * original Tensor.
+ * @return A TensorPtr aliasing the same storage with requested metadata.
  */
-inline TensorPtr make_tensor_ptr(const executorch::aten::Tensor& tensor) {
+inline TensorPtr make_tensor_ptr(
+    const executorch::aten::Tensor& tensor,
+    std::vector<executorch::aten::SizesType> sizes = {},
+    std::vector<executorch::aten::DimOrderType> dim_order = {},
+    std::vector<executorch::aten::StridesType> strides = {},
+    std::function<void(void*)> deleter = nullptr) {
+  if (sizes.empty()) {
+    sizes.assign(tensor.sizes().begin(), tensor.sizes().end());
+  }
+  const auto same_rank = sizes.size() == static_cast<size_t>(tensor.dim());
+  const auto same_shape = same_rank &&
+      std::equal(sizes.begin(), sizes.end(), tensor.sizes().begin());
+  const auto element_count =
+      executorch::aten::compute_numel(sizes.data(), sizes.size());
+  const auto parent_element_count = tensor.numel();
+  ET_CHECK_MSG(
+      element_count <= parent_element_count,
+      "Requested view has %zd elements, but source tensor only has %zd.",
+      static_cast<ssize_t>(element_count),
+      static_cast<ssize_t>(parent_element_count));
+#ifndef USE_ATEN_LIB
+  if (dim_order.empty() && strides.empty() && same_rank) {
+    dim_order.assign(tensor.dim_order().begin(), tensor.dim_order().end());
+  }
+#endif // USE_ATEN_LIB
+  if (strides.empty() && dim_order.empty() && same_shape) {
+    strides.assign(tensor.strides().begin(), tensor.strides().end());
+  }
   return make_tensor_ptr(
-      std::vector<executorch::aten::SizesType>(
-          tensor.sizes().begin(), tensor.sizes().end()),
+      std::move(sizes),
       tensor.mutable_data_ptr(),
-#ifndef USE_ATEN_LIB
-      std::vector<executorch::aten::DimOrderType>(
-          tensor.dim_order().begin(), tensor.dim_order().end()),
-      std::vector<executorch::aten::StridesType>(
-          tensor.strides().begin(), tensor.strides().end()),
+      std::move(dim_order),
+      std::move(strides),
       tensor.scalar_type(),
-      tensor.shape_dynamism()
+#ifndef USE_ATEN_LIB
+      tensor.shape_dynamism(),
 #else // USE_ATEN_LIB
-      {},
-      std::vector<executorch::aten::StridesType>(
-          tensor.strides().begin(), tensor.strides().end()),
-      tensor.scalar_type()
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
 #endif // USE_ATEN_LIB
-  );
+      std::move(deleter));
+}
+
+/**
+ * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...).
+ * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed.
+ *
+ * @param tensor_ptr The source tensor pointer to alias.
+ * @param sizes Optional sizes override.
+ * @param dim_order Optional dimension order override.
+ * @param strides Optional strides override.
+ * @return A TensorPtr aliasing the same storage with requested metadata.
+ */
+inline TensorPtr make_tensor_ptr(
+    const TensorPtr& tensor_ptr,
+    std::vector<executorch::aten::SizesType> sizes = {},
+    std::vector<executorch::aten::DimOrderType> dim_order = {},
+    std::vector<executorch::aten::StridesType> strides = {}) {
+  return make_tensor_ptr(
+      *tensor_ptr,
+      std::move(sizes),
+      std::move(dim_order),
+      std::move(strides),
+      [tensor_ptr](void*) {});
 }
 
 /**
diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp
index 511b0ebe582..b71dfab8eeb 100644
--- a/extension/tensor/tensor_ptr_maker.cpp
+++ b/extension/tensor/tensor_ptr_maker.cpp
@@ -96,7 +96,7 @@ TensorPtr random_strided(
     }
   } ctx;
 
-  ET_SWITCH_REALHBBF16_TYPES(type, ctx, "random_strided", CTYPE, [&] {
+  ET_SWITCH_REALHBBF16_AND_UINT_TYPES(type, ctx, "random_strided", CTYPE, [&] {
     std::generate_n(tensor->mutable_data_ptr<CTYPE>(), tensor->numel(), [&]() {
       return static_cast<CTYPE>(distribution(gen));
     });
@@ -138,7 +138,7 @@ TensorPtr full_strided(
     }
   } ctx;
 
-  ET_SWITCH_REALHBBF16_TYPES(type, ctx, "full_strided", CTYPE, [&] {
+  ET_SWITCH_REALHBBF16_AND_UINT_TYPES(type, ctx, "full_strided", CTYPE, [&] {
     CTYPE value;
     ET_EXTRACT_SCALAR(fill_value, value);
     std::fill(
diff --git a/extension/tensor/test/tensor_ptr_test.cpp b/extension/tensor/test/tensor_ptr_test.cpp
index 6c98db52d41..5e242e5eb02 100644
--- a/extension/tensor/test/tensor_ptr_test.cpp
+++ b/extension/tensor/test/tensor_ptr_test.cpp
@@ -347,7 +347,7 @@ TEST_F(TensorPtrTest, TensorSharingImplResizingAffectsBothVector) {
 TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt32) {
   std::vector<int32_t> data = {1, 2, 3, 4};
   auto tensor = make_tensor_ptr({2, 2}, data);
-  auto new_tensor = make_tensor_ptr(*tensor);
+  auto new_tensor = make_tensor_ptr(tensor);
 
   EXPECT_EQ(new_tensor->dim(), tensor->dim());
   EXPECT_EQ(new_tensor->size(0), tensor->size(0));
@@ -357,10 +357,208 @@ TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt32) {
   EXPECT_EQ(new_tensor->scalar_type(), executorch::aten::ScalarType::Int);
 }
 
+TEST_F(TensorPtrTest, MakeViewOverrideSizesRankIncrease) {
+  std::vector<float> data = {1, 2, 3, 4, 5, 6};
+  auto tensor = make_tensor_ptr({2, 3}, std::move(data));
+  auto view = make_tensor_ptr(tensor, {1, 2, 3});
+
+  EXPECT_EQ(view->dim(), 3);
+  EXPECT_EQ(view->size(0), 1);
+  EXPECT_EQ(view->size(1), 2);
+  EXPECT_EQ(view->size(2), 3);
+  EXPECT_EQ(view->const_data_ptr<float>(), tensor->const_data_ptr<float>());
+  EXPECT_EQ(view->strides()[0], 6);
+  EXPECT_EQ(view->strides()[1], 3);
+  EXPECT_EQ(view->strides()[2], 1);
+}
+
+TEST_F(TensorPtrTest, MakeViewOverrideSizesSameRankRecomputesStrides) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data);
+  auto view = make_tensor_ptr(tensor, {4, 3});
+
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 4);
+  EXPECT_EQ(view->size(1), 3);
+  EXPECT_EQ(view->strides()[0], 3);
+  EXPECT_EQ(view->strides()[1], 1);
+}
+
+TEST_F(TensorPtrTest, MakeViewOverrideDimOrderOnly) {
+  float data[6] = {0};
+  auto tensor = make_tensor_ptr({2, 3}, data);
+  auto view = make_tensor_ptr(tensor, {}, {1, 0}, {});
+
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 2);
+  EXPECT_EQ(view->size(1), 3);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_EQ(view->strides()[1], 2);
+}
+
+TEST_F(TensorPtrTest, MakeViewOverrideStridesOnlyInfersDimOrder) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data);
+  auto view = make_tensor_ptr(tensor, {}, {}, {1, 3});
+
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 3);
+  EXPECT_EQ(view->size(1), 4);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_EQ(view->strides()[1], 3);
+}
+
+TEST_F(TensorPtrTest, MakeViewReuseMetadataWhenShapeSame) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data, {1, 0}, {1, 3});
+  auto view = make_tensor_ptr(tensor, {3, 4});
+
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 3);
+  EXPECT_EQ(view->size(1), 4);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_EQ(view->strides()[1], 3);
+}
+
+TEST_F(TensorPtrTest, MakeViewShapeChangeWithExplicitOldStridesExpectDeath) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data);
+  std::vector<executorch::aten::StridesType> old_strides(
+      tensor->strides().begin(), tensor->strides().end());
+
+  ET_EXPECT_DEATH(
+      { auto _ = make_tensor_ptr(tensor, {2, 6}, {}, old_strides); }, "");
+}
+
+TEST_F(TensorPtrTest, MakeViewInvalidDimOrderExpectDeath) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data);
+
+  ET_EXPECT_DEATH(
+      { auto _ = make_tensor_ptr(tensor, {3, 4}, {2, 1}, {1, 4}); }, "");
+}
+
+TEST_F(TensorPtrTest, MakeViewFromTensorPtrConvenienceOverload) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr({3, 4}, data);
+  auto view = make_tensor_ptr(tensor, {}, {1, 0}, {});
+
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 3);
+  EXPECT_EQ(view->size(1), 4);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_EQ(view->strides()[1], 3);
+}
+
+TEST_F(TensorPtrTest, MakeViewRankDecreaseFlatten) {
+  float data[6] = {1, 2, 3, 4, 5, 6};
+  auto tensor = make_tensor_ptr(
+      {2, 3},
+      data,
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_UNBOUND);
+  auto view = make_tensor_ptr(tensor, {6});
+  EXPECT_EQ(view->dim(), 1);
+  EXPECT_EQ(view->size(0), 6);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_NE(tensor->unsafeGetTensorImpl(), view->unsafeGetTensorImpl());
+  EXPECT_EQ(resize_tensor_ptr(view, {3, 2}), Error::NotSupported);
+  EXPECT_EQ(view->dim(), 1);
+  EXPECT_EQ(view->size(0), 6);
+}
+
+TEST_F(TensorPtrTest, MakeViewFromScalarAliasAnd1D) {
+  float scalar_value = 7.f;
+  auto tensor = make_tensor_ptr({}, &scalar_value);
+  auto alias = make_tensor_ptr(tensor);
+  EXPECT_EQ(alias->dim(), 0);
+  EXPECT_EQ(alias->numel(), 1);
+  auto reshaped = make_tensor_ptr(tensor, {1});
+  EXPECT_EQ(reshaped->dim(), 1);
+  EXPECT_EQ(reshaped->size(0), 1);
+  EXPECT_EQ(reshaped->strides()[0], 1);
+  ET_EXPECT_DEATH({ auto unused = make_tensor_ptr(tensor, {}, {0}, {}); }, "");
+  ET_EXPECT_DEATH({ auto unused = make_tensor_ptr(tensor, {}, {}, {1}); }, "");
+}
+
+TEST_F(TensorPtrTest, MakeViewExplicitDimOrderAndStridesShapeChange) {
+  float data[6] = {0};
+  auto tensor = make_tensor_ptr({2, 3}, data);
+  auto view = make_tensor_ptr(tensor, {3, 2}, {1, 0}, {1, 3});
+  EXPECT_EQ(view->dim(), 2);
+  EXPECT_EQ(view->size(0), 3);
+  EXPECT_EQ(view->size(1), 2);
+  EXPECT_EQ(view->strides()[0], 1);
+  EXPECT_EQ(view->strides()[1], 3);
+}
+
+TEST_F(TensorPtrTest, TensorUint8dataInt16Type) {
+  std::vector<int16_t> int16_values = {-1, 2, -3, 4};
+  auto byte_pointer = reinterpret_cast<const uint8_t*>(int16_values.data());
+  std::vector<uint8_t> byte_data(
+      byte_pointer, byte_pointer + int16_values.size() * sizeof(int16_t));
+  auto tensor = make_tensor_ptr(
+      {4}, std::move(byte_data), executorch::aten::ScalarType::Short);
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  auto int16_data = tensor->const_data_ptr<int16_t>();
+  EXPECT_EQ(int16_data[0], -1);
+  EXPECT_EQ(int16_data[1], 2);
+  EXPECT_EQ(int16_data[2], -3);
+  EXPECT_EQ(int16_data[3], 4);
+}
+
+TEST_F(TensorPtrTest, MakeView3DDimOrderOnly) {
+  float data[24] = {0};
+  auto tensor = make_tensor_ptr({2, 3, 4}, data);
+  auto view = make_tensor_ptr(tensor, {}, {2, 0, 1}, {});
+  EXPECT_EQ(view->dim(), 3);
+  EXPECT_EQ(view->size(0), 2);
+  EXPECT_EQ(view->size(1), 3);
+  EXPECT_EQ(view->size(2), 4);
+  EXPECT_EQ(view->strides()[0], 3);
+  EXPECT_EQ(view->strides()[1], 1);
+  EXPECT_EQ(view->strides()[2], 6);
+}
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrTest, MakeViewDynamismPropagationResizeAlias) {
+  float data[12] = {0};
+  auto tensor = make_tensor_ptr(
+      {3, 4},
+      data,
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_UNBOUND);
+  auto alias = make_tensor_ptr(tensor);
+  EXPECT_EQ(resize_tensor_ptr(alias, {2, 6}), Error::Ok);
+  EXPECT_EQ(alias->size(0), 2);
+  EXPECT_EQ(alias->size(1), 6);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->size(1), 4);
+}
+
+TEST_F(TensorPtrTest, MakeViewSameRankShapeChangeCopiesDimOrder) {
+  float data[24] = {0};
+  auto tensor = make_tensor_ptr({2, 3, 4}, data, {2, 0, 1}, {3, 1, 6});
+  auto view = make_tensor_ptr(tensor, {4, 2, 3});
+  EXPECT_EQ(view->dim(), 3);
+  EXPECT_EQ(view->size(0), 4);
+  EXPECT_EQ(view->size(1), 2);
+  EXPECT_EQ(view->size(2), 3);
+  EXPECT_EQ(view->strides()[0], 2);
+  EXPECT_EQ(view->strides()[1], 1);
+  EXPECT_EQ(view->strides()[2], 8);
+}
+#endif
+
 TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorInt32) {
   std::vector<int32_t> data = {1, 2, 3, 4};
   auto tensor = make_tensor_ptr({2, 2}, std::move(data));
-  auto cloned_tensor = clone_tensor_ptr(*tensor);
+  auto cloned_tensor = clone_tensor_ptr(tensor);
 
   EXPECT_EQ(cloned_tensor->dim(), tensor->dim());
   EXPECT_EQ(cloned_tensor->size(0), tensor->size(0));
@@ -373,6 +571,56 @@ TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorInt32) {
   EXPECT_EQ(cloned_tensor->scalar_type(), executorch::aten::ScalarType::Int);
 }
 
+TEST_F(TensorPtrTest, MakeTensorPtrFromTensorPtrInt32) {
+  std::vector<int32_t> data = {1, 2, 3, 4};
+  auto tensor = make_tensor_ptr({2, 2}, data);
+  auto new_tensor = make_tensor_ptr(tensor);
+
+  EXPECT_EQ(new_tensor->dim(), tensor->dim());
+  EXPECT_EQ(new_tensor->size(0), tensor->size(0));
+  EXPECT_EQ(new_tensor->size(1), tensor->size(1));
+  EXPECT_EQ(
+      new_tensor->const_data_ptr<int32_t>(), tensor->const_data_ptr<int32_t>());
+  EXPECT_EQ(new_tensor->scalar_type(), executorch::aten::ScalarType::Int);
+}
+
+TEST_F(TensorPtrTest, MakeTensorPtrFromTensorPtrDouble) {
+  std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
+  auto tensor = make_tensor_ptr({2, 2}, data);
+  auto new_tensor = make_tensor_ptr(tensor);
+
+  EXPECT_EQ(new_tensor->dim(), tensor->dim());
+  EXPECT_EQ(new_tensor->size(0), tensor->size(0));
+  EXPECT_EQ(new_tensor->size(1), tensor->size(1));
+  EXPECT_EQ(
+      new_tensor->const_data_ptr<double>(), tensor->const_data_ptr<double>());
+  EXPECT_EQ(new_tensor->scalar_type(), executorch::aten::ScalarType::Double);
+}
+
+TEST_F(TensorPtrTest, MakeTensorPtrFromTensorPtrInt64) {
+  std::vector<int64_t> data = {100, 200, 300, 400};
+  auto tensor = make_tensor_ptr({2, 2}, data);
+  auto new_tensor = make_tensor_ptr(tensor);
+
+  EXPECT_EQ(new_tensor->dim(), tensor->dim());
+  EXPECT_EQ(new_tensor->size(0), tensor->size(0));
+  EXPECT_EQ(new_tensor->size(1), tensor->size(1));
+  EXPECT_EQ(
+      new_tensor->const_data_ptr<int64_t>(), tensor->const_data_ptr<int64_t>());
+  EXPECT_EQ(new_tensor->scalar_type(), executorch::aten::ScalarType::Long);
+}
+
+TEST_F(TensorPtrTest, MakeTensorPtrFromTensorPtrNull) {
+  auto tensor = make_tensor_ptr({2, 2}, nullptr);
+  auto new_tensor = make_tensor_ptr(tensor);
+
+  EXPECT_EQ(new_tensor->dim(), tensor->dim());
+  EXPECT_EQ(new_tensor->size(0), tensor->size(0));
+  EXPECT_EQ(new_tensor->size(1), tensor->size(1));
+  EXPECT_EQ(new_tensor->const_data_ptr(), tensor->const_data_ptr());
+  EXPECT_EQ(new_tensor->const_data_ptr(), nullptr);
+}
+
 TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrInt32) {
   std::vector<int32_t> data = {1, 2, 3, 4};
   auto tensor = make_tensor_ptr({2, 2}, std::move(data));
@@ -392,7 +640,7 @@ TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrInt32) {
 TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorDouble) {
   std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
   auto tensor = make_tensor_ptr({2, 2}, data);
-  auto new_tensor = make_tensor_ptr(*tensor);
+  auto new_tensor = make_tensor_ptr(tensor);
 
   EXPECT_EQ(new_tensor->dim(), tensor->dim());
   EXPECT_EQ(new_tensor->size(0), tensor->size(0));
@@ -405,7 +653,7 @@ TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorDouble) {
 TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorDouble) {
   std::vector<double> data = {1.0, 2.0, 3.0, 4.0};
   auto tensor = make_tensor_ptr({2, 2}, std::move(data));
-  auto cloned_tensor = clone_tensor_ptr(*tensor);
+  auto cloned_tensor = clone_tensor_ptr(tensor);
 
   EXPECT_EQ(cloned_tensor->dim(), tensor->dim());
   EXPECT_EQ(cloned_tensor->size(0), tensor->size(0));
@@ -437,7 +685,7 @@ TEST_F(TensorPtrTest, CloneTensorPtrFromTensorPtrDouble) {
 TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt64) {
   std::vector<int64_t> data = {100, 200, 300, 400};
   auto tensor = make_tensor_ptr({2, 2}, data);
-  auto new_tensor = make_tensor_ptr(*tensor);
+  auto new_tensor = make_tensor_ptr(tensor);
 
   EXPECT_EQ(new_tensor->dim(), tensor->dim());
   EXPECT_EQ(new_tensor->size(0), tensor->size(0));
@@ -450,7 +698,7 @@ TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorInt64) {
 TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorInt64) {
   std::vector<int64_t> data = {100, 200, 300, 400};
   auto tensor = make_tensor_ptr({2, 2}, std::move(data));
-  auto cloned_tensor = clone_tensor_ptr(*tensor);
+  auto cloned_tensor = clone_tensor_ptr(tensor);
 
   EXPECT_EQ(cloned_tensor->dim(), tensor->dim());
   EXPECT_EQ(cloned_tensor->size(0), tensor->size(0));
@@ -753,7 +1001,7 @@ TEST_F(TensorPtrTest, TensorDeducedScalarType) {
   EXPECT_EQ(tensor->const_data_ptr<double>()[3], 4.0);
 }
 
-TEST_F(TensorPtrTest, TensorUint8BufferWithFloatScalarType) {
+TEST_F(TensorPtrTest, TensorUint8dataWithFloatScalarType) {
   std::vector<uint8_t> data(
       4 * executorch::aten::elementSize(executorch::aten::ScalarType::Float));
 
@@ -777,19 +1025,87 @@ TEST_F(TensorPtrTest, TensorUint8BufferWithFloatScalarType) {
   EXPECT_EQ(tensor->const_data_ptr<float>()[3], 4.0f);
 }
 
-TEST_F(TensorPtrTest, TensorUint8BufferTooSmallExpectDeath) {
+TEST_F(TensorPtrTest, TensorUint8dataTooSmallExpectDeath) {
   std::vector<uint8_t> data(
       2 * executorch::aten::elementSize(executorch::aten::ScalarType::Float));
   ET_EXPECT_DEATH(
       { auto tensor = make_tensor_ptr({2, 2}, std::move(data)); }, "");
 }
 
-TEST_F(TensorPtrTest, TensorUint8BufferTooLargeExpectDeath) {
+TEST_F(TensorPtrTest, TensorUint8dataTooLargeExpectDeath) {
   std::vector<uint8_t> data(
       5 * executorch::aten::elementSize(executorch::aten::ScalarType::Float));
   ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({2, 2}, std::move(data)); }, "");
 }
 
+TEST_F(TensorPtrTest, MakeViewFromTensorPtrKeepsSourceAlive) {
+  bool freed = false;
+  auto* data = new float[6]{1, 2, 3, 4, 5, 6};
+  auto tensor = make_tensor_ptr(
+      {2, 3},
+      data,
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      [&freed](void* p) {
+        freed = true;
+        delete[] static_cast<float*>(p);
+      });
+  auto view = make_tensor_ptr(tensor);
+  tensor.reset();
+  EXPECT_FALSE(freed);
+  EXPECT_EQ(view->const_data_ptr<float>()[0], 1.0f);
+  view->mutable_data_ptr<float>()[0] = 42.0f;
+  EXPECT_EQ(view->const_data_ptr<float>()[0], 42.0f);
+  view.reset();
+  EXPECT_TRUE(freed);
+}
+
+TEST_F(TensorPtrTest, MakeViewFromTensorDoesNotKeepAliveByDefault) {
+  bool freed = false;
+  auto* data = new float[2]{7.0f, 8.0f};
+  auto tensor = make_tensor_ptr(
+      {2, 1},
+      data,
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      [&freed](void* p) {
+        freed = true;
+        delete[] static_cast<float*>(p);
+      });
+  auto view = make_tensor_ptr(*tensor);
+  auto raw = view->const_data_ptr<float>();
+  EXPECT_EQ(raw, data);
+  tensor.reset();
+  EXPECT_TRUE(freed);
+  view.reset();
+}
+
+TEST_F(TensorPtrTest, MakeViewFromTensorWithDeleterKeepsAlive) {
+  bool freed = false;
+  auto* data = new float[3]{1.0f, 2.0f, 3.0f};
+  auto tensor = make_tensor_ptr(
+      {3},
+      data,
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      [&freed](void* p) {
+        freed = true;
+        delete[] static_cast<float*>(p);
+      });
+  auto view = make_tensor_ptr(*tensor, {}, {}, {}, [tensor](void*) {});
+  tensor.reset();
+  EXPECT_FALSE(freed);
+  EXPECT_EQ(view->const_data_ptr<float>()[2], 3.0f);
+  view.reset();
+  EXPECT_TRUE(freed);
+}
+
 TEST_F(TensorPtrTest, VectorFloatTooSmallExpectDeath) {
   std::vector<float> data(9, 1.f);
   ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({2, 5}, std::move(data)); }, "");
@@ -825,3 +1141,167 @@ TEST_F(TensorPtrTest, TensorDataCastingInvalidCast) {
       },
       "");
 }
+
+TEST_F(TensorPtrTest, TensorDataOnlyUInt16Type) {
+  std::vector<uint16_t> data = {1u, 65535u, 42u, 0u};
+  auto tensor = make_tensor_ptr(std::move(data));
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt16);
+  auto ptr = tensor->const_data_ptr<uint16_t>();
+  EXPECT_EQ(ptr[0], 1u);
+  EXPECT_EQ(ptr[1], 65535u);
+  EXPECT_EQ(ptr[2], 42u);
+  EXPECT_EQ(ptr[3], 0u);
+}
+
+TEST_F(TensorPtrTest, TensorDataOnlyUInt32Type) {
+  std::vector<uint32_t> data = {0u, 123u, 4000000000u};
+  auto tensor = make_tensor_ptr(std::move(data));
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt32);
+  auto ptr = tensor->const_data_ptr<uint32_t>();
+  EXPECT_EQ(ptr[0], 0u);
+  EXPECT_EQ(ptr[1], 123u);
+  EXPECT_EQ(ptr[2], 4000000000u);
+}
+
+TEST_F(TensorPtrTest, TensorDataOnlyUInt64Type) {
+  std::vector<uint64_t> data = {0ull, 1ull, 9000000000000000000ull};
+  auto tensor = make_tensor_ptr(std::move(data));
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->strides()[0], 1);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt64);
+  auto ptr = tensor->const_data_ptr<uint64_t>();
+  EXPECT_EQ(ptr[0], 0ull);
+  EXPECT_EQ(ptr[1], 1ull);
+  EXPECT_EQ(ptr[2], 9000000000000000000ull);
+}
+
+TEST_F(TensorPtrTest, TensorUint8dataUInt32Type) {
+  std::vector<uint32_t> values = {1u, 4000000000u, 123u};
+  const auto* bytes = reinterpret_cast<const uint8_t*>(values.data());
+  std::vector<uint8_t> raw(bytes, bytes + values.size() * sizeof(uint32_t));
+  auto tensor = make_tensor_ptr(
+      {3}, std::move(raw), executorch::aten::ScalarType::UInt32);
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt32);
+  auto ptr = tensor->const_data_ptr<uint32_t>();
+  EXPECT_EQ(ptr[0], 1u);
+  EXPECT_EQ(ptr[1], 4000000000u);
+  EXPECT_EQ(ptr[2], 123u);
+}
+
+TEST_F(TensorPtrTest, TensorUint8dataUInt64Type) {
+  std::vector<uint64_t> values = {0ull, 42ull, 9000000000000000000ull};
+  const auto* bytes = reinterpret_cast<const uint8_t*>(values.data());
+  std::vector<uint8_t> raw(bytes, bytes + values.size() * sizeof(uint64_t));
+  auto tensor = make_tensor_ptr(
+      {3}, std::move(raw), executorch::aten::ScalarType::UInt64);
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 3);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt64);
+  auto ptr = tensor->const_data_ptr<uint64_t>();
+  EXPECT_EQ(ptr[0], 0ull);
+  EXPECT_EQ(ptr[1], 42ull);
+  EXPECT_EQ(ptr[2], 9000000000000000000ull);
+}
+
+TEST_F(TensorPtrTest, TensorUint8dataSizeMismatchUInt32ExpectDeath) {
+  std::vector<uint8_t> data(
+      3 * executorch::aten::elementSize(executorch::aten::ScalarType::UInt32) -
+      1);
+  ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({3}, std::move(data)); }, "");
+}
+
+TEST_F(TensorPtrTest, TensorUint8dataSizeMismatchUInt64ExpectDeath) {
+  std::vector<uint8_t> data(
+      2 * executorch::aten::elementSize(executorch::aten::ScalarType::UInt64) +
+      1);
+  ET_EXPECT_DEATH({ auto _ = make_tensor_ptr({2}, std::move(data)); }, "");
+}
+
+TEST_F(TensorPtrTest, TensorDataCastingFromInt32ToUInt16) {
+  std::vector<int32_t> data = {-1, 65535, 65536, -65536};
+  auto tensor =
+      make_tensor_ptr(std::move(data), executorch::aten::ScalarType::UInt16);
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt16);
+  auto ptr = tensor->const_data_ptr<uint16_t>();
+  EXPECT_EQ(ptr[0], static_cast<uint16_t>(-1));
+  EXPECT_EQ(ptr[1], static_cast<uint16_t>(65535));
+  EXPECT_EQ(ptr[2], static_cast<uint16_t>(65536));
+  EXPECT_EQ(ptr[3], static_cast<uint16_t>(-65536));
+}
+
+TEST_F(TensorPtrTest, TensorDataCastingFromUInt32ToFloat) {
+  std::vector<uint32_t> data = {0u, 123u, 4000000000u};
+  auto tensor =
+      make_tensor_ptr(std::move(data), executorch::aten::ScalarType::Float);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Float);
+  auto ptr = tensor->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(ptr[0], 0.0f);
+  EXPECT_FLOAT_EQ(ptr[1], 123.0f);
+  EXPECT_FLOAT_EQ(ptr[2], 4000000000.0f);
+}
+
+TEST_F(TensorPtrTest, TensorDataCastingFromFloatToUInt32) {
+  std::vector<float> data = {1.0f, 2.0f};
+  auto tensor =
+      make_tensor_ptr(std::move(data), executorch::aten::ScalarType::UInt32);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt32);
+
+  auto ptr = tensor->const_data_ptr<uint32_t>();
+  EXPECT_EQ(ptr[0], 1u);
+  EXPECT_EQ(ptr[1], 2u);
+}
+
+TEST_F(TensorPtrTest, MakeTensorPtrFromExistingTensorUInt32) {
+  std::vector<uint32_t> data = {10u, 20u, 30u, 40u};
+  auto tensor = make_tensor_ptr({2, 2}, data);
+  auto alias = make_tensor_ptr(tensor);
+  EXPECT_EQ(alias->dim(), 2);
+  EXPECT_EQ(alias->size(0), 2);
+  EXPECT_EQ(alias->size(1), 2);
+  EXPECT_EQ(alias->scalar_type(), executorch::aten::ScalarType::UInt32);
+  EXPECT_EQ(
+      alias->const_data_ptr<uint32_t>(), tensor->const_data_ptr<uint32_t>());
+}
+
+TEST_F(TensorPtrTest, CloneTensorPtrFromExistingTensorUInt32) {
+  std::vector<uint32_t> data = {10u, 20u, 30u, 40u};
+  auto tensor = make_tensor_ptr({2, 2}, std::move(data));
+  auto cloned = clone_tensor_ptr(tensor);
+  EXPECT_EQ(cloned->dim(), 2);
+  EXPECT_EQ(cloned->size(0), 2);
+  EXPECT_EQ(cloned->size(1), 2);
+  EXPECT_EQ(cloned->scalar_type(), executorch::aten::ScalarType::UInt32);
+  EXPECT_NE(
+      cloned->const_data_ptr<uint32_t>(), tensor->const_data_ptr<uint32_t>());
+  auto ptr = cloned->const_data_ptr<uint32_t>();
+  EXPECT_EQ(ptr[0], 10u);
+  EXPECT_EQ(ptr[3], 40u);
+}
+
+TEST_F(TensorPtrTest, Tensor2DUInt16OwningData) {
+  std::vector<uint16_t> data = {1u, 2u, 3u, 4u, 5u, 6u};
+  auto tensor = make_tensor_ptr({2, 3}, std::move(data));
+  EXPECT_EQ(tensor->dim(), 2);
+  EXPECT_EQ(tensor->size(0), 2);
+  EXPECT_EQ(tensor->size(1), 3);
+  EXPECT_EQ(tensor->strides()[0], 3);
+  EXPECT_EQ(tensor->strides()[1], 1);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::UInt16);
+  auto ptr = tensor->const_data_ptr<uint16_t>();
+  EXPECT_EQ(ptr[0], 1u);
+  EXPECT_EQ(ptr[5], 6u);
+}
diff --git a/extension/testing_util/targets.bzl b/extension/testing_util/targets.bzl
index 05b825645e8..a5ad1fb9b8c 100644
--- a/extension/testing_util/targets.bzl
+++ b/extension/testing_util/targets.bzl
@@ -14,6 +14,7 @@ def define_common_targets():
         visibility = [
             "//executorch/devtools/etdump/tests/...",
             "//executorch/extension/data_loader/test/...",
+            "//executorch/extension/llm/runner/test/...",
             "//executorch/extension/testing_util/test/...",
             "//executorch/extension/fb/ptez/decompression_methods/test/...",
             "//executorch/extension/fb/ptez/test/...",
diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt
index a6c06e84293..3b9c7c66ddb 100644
--- a/extension/threadpool/CMakeLists.txt
+++ b/extension/threadpool/CMakeLists.txt
@@ -20,6 +20,16 @@ if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 
+# Threadpool size specifiers. Mutual exclusion is checking in default.cmake.
+# Default to using performance cores if
+# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set.
+set(_threadpool_size_flag)
+if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
+  set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES")
+else()
+  set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES")
+endif()
+
 add_library(
   extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp
                        cpuinfo_utils.cpp
@@ -36,14 +46,16 @@ target_include_directories(
     $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include>
     $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include>
 )
-target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL)
+target_compile_definitions(
+  extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag}
+)
 target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})
 
 # Install libraries
 install(
   TARGETS extension_threadpool
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl
index 6ef55c42434..1889cb650ad 100644
--- a/extension/threadpool/targets.bzl
+++ b/extension/threadpool/targets.bzl
@@ -22,6 +22,7 @@ def define_common_targets():
         name = "threadpool_lib",
         srcs = _THREADPOOL_SRCS,
         deps = [
+            ":cpuinfo_utils",
             "//executorch/runtime/core:core",
             "//executorch/runtime/core/portable_type/c10/c10:c10",
         ],
diff --git a/extension/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp
index e7784d3cc11..052e6c22f5e 100644
--- a/extension/threadpool/test/threadpool_test.cpp
+++ b/extension/threadpool/test/threadpool_test.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/extension/threadpool/threadpool.h>
+#include <executorch/runtime/platform/runtime.h>
 
 #include <mutex>
 #include <numeric>
@@ -71,6 +72,8 @@ void run_lambda_with_size(
 } // namespace
 
 TEST(ThreadPoolTest, ParallelAdd) {
+  executorch::runtime::runtime_init();
+
   std::vector<int32_t> a, b, c, c_ref;
   size_t vector_size = 100;
   size_t grain_size = 10;
@@ -111,6 +114,8 @@ TEST(ThreadPoolTest, ParallelAdd) {
 
 // Test parallel reduction where we acquire lock within lambda
 TEST(ThreadPoolTest, ParallelReduce) {
+  executorch::runtime::runtime_init();
+
   std::vector<int32_t> a;
   int32_t c = 0, c_ref = 0;
   size_t vector_size = 100;
@@ -144,6 +149,8 @@ TEST(ThreadPoolTest, ParallelReduce) {
 // Copied from
 // caffe2/aten/src/ATen/test/test_thread_pool_guard.cp
 TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
+  executorch::runtime::runtime_init();
+
   auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool();
 
   ASSERT_NE(threadpool_ptr, nullptr);
@@ -173,6 +180,8 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
 }
 
 TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
+  executorch::runtime::runtime_init();
+
   const std::vector<int64_t> array = {1, 2, 3};
 
   auto pool = ::executorch::extension::threadpool::get_threadpool();
diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp
index e9f3b0f5f4a..f4d88e668d6 100644
--- a/extension/threadpool/threadpool.cpp
+++ b/extension/threadpool/threadpool.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/extension/threadpool/cpuinfo_utils.h>
 #include <executorch/extension/threadpool/threadpool.h>
 
 #include <algorithm>
@@ -13,9 +14,26 @@
 
 #include <executorch/extension/threadpool/threadpool_guard.h>
 #include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/runtime.h>
 
 #include <cpuinfo.h>
 
+// At most one mode should be set.
+#if (                                                       \
+    defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
+    defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES))
+#error Multiple \
+            threadpool size specifiers are set.At most one of                \
+    EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES,                             \
+    and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined.
+#endif
+
+// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set.
+#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
+    !defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)
+#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1
+#endif
+
 namespace executorch::extension::threadpool {
 
 #if !(defined(WIN32))
@@ -44,6 +62,8 @@ size_t ThreadPool::get_thread_count() const {
 }
 
 bool ThreadPool::_unsafe_reset_threadpool(uint32_t new_thread_count) {
+  ET_LOG(Info, "Resetting threadpool to %u threads.", new_thread_count);
+
   // No need to do anything if the count is same or 0
   if (new_thread_count == get_thread_count() || new_thread_count == 0) {
     return true;
@@ -95,13 +115,21 @@ void ThreadPool::run(
 // get_threadpool is not thread safe due to leak_corrupted_threadpool
 // Make this part threadsafe: TODO(kimishpatel)
 ThreadPool* get_threadpool() {
+  executorch::runtime::runtime_init();
+
   if (!cpuinfo_initialize()) {
     ET_LOG(Error, "cpuinfo initialization failed");
     return nullptr; // NOLINT(facebook-hte-NullableReturn)
   }
 
   static const int num_threads = ([]() {
-    int result = cpuinfo_get_processors_count();
+#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
+    // Use threads=cores.
+    auto result = cpuinfo_get_processors_count();
+#else
+    // Set threads equal to the number of performance cores.
+    auto result = ::executorch::extension::cpuinfo::get_num_performant_cores();
+#endif
 
     /*
      * For llvm-tsan, holding limit for the number of locks for a single thread
@@ -111,9 +139,10 @@ ThreadPool* get_threadpool() {
      * tricky to detect if we are running under tsan, for now capping the
      * default threadcount to the tsan limit unconditionally.
      */
-    constexpr int tsan_thread_limit = 63;
+    constexpr unsigned int tsan_thread_limit = 63;
     return std::min(result, tsan_thread_limit);
   })();
+
   static auto threadpool = std::make_unique<ThreadPool>(num_threads);
 
 // Inheriting from old threadpool to get around segfault issue
diff --git a/extension/threadpool/threadpool.h b/extension/threadpool/threadpool.h
index 3ad2d1d48d4..16acad6e5fa 100644
--- a/extension/threadpool/threadpool.h
+++ b/extension/threadpool/threadpool.h
@@ -14,6 +14,22 @@
 
 #include <pthreadpool.h>
 
+/*
+ * Threadpool Options:
+ *
+ * Threadpool size has a sizble affect on performance. By default, the
+ * threadpool will be sized according to the number of performance cores. This
+ * behavior can be overriden with the following build-time options. Note that
+ * these options are mutually exclusive.
+ *
+ * - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool
+ * equal to the number of performance cores on the system. This is the default
+ * behavior.
+ * - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool
+ * equal to the number of logical cores on system. This is the historical
+ * behavior.
+ */
+
 namespace executorch::extension::threadpool {
 
 class ThreadPool final {
diff --git a/extension/training/CMakeLists.txt b/extension/training/CMakeLists.txt
index 1e17913141d..8f572514aa5 100644
--- a/extension/training/CMakeLists.txt
+++ b/extension/training/CMakeLists.txt
@@ -70,7 +70,14 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_options(
-    _training_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions
+    _training_lib
+    PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc
+           /GR
+           /wd4996>
+           $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wno-deprecated-declarations
+           -fPIC
+           -frtti
+           -fexceptions>
   )
   target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
 
@@ -83,7 +90,7 @@ endif()
 install(
   TARGETS extension_training
   EXPORT ExecuTorchTargets
-  DESTINATION lib
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
diff --git a/extension/wasm/CMakeLists.txt b/extension/wasm/CMakeLists.txt
index 36c336e17c5..8ffd1801c63 100644
--- a/extension/wasm/CMakeLists.txt
+++ b/extension/wasm/CMakeLists.txt
@@ -27,7 +27,15 @@ if(NOT EXECUTORCH_ROOT)
 endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
-set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
+set(_common_compile_options
+    $<$<CXX_COMPILER_ID:MSVC>:/W4
+    /WX
+    /wd4996>
+    $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wall
+    -Werror
+    -Wno-deprecated-declarations
+    -fPIC>
+)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
 set(link_libraries)
diff --git a/install_requirements.py b/install_requirements.py
index b84e250cf87..a026e5b9964 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,33 +12,12 @@
 
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-from torch_pin import NIGHTLY_VERSION, TORCH_VERSION
+from torch_pin import NIGHTLY_VERSION, SUPPORTED_CUDA_VERSIONS, TORCH_VERSION
 
 # The pip repository that hosts nightly torch packages.
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
 TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
 
-# Supported CUDA versions - modify this to add/remove supported versions
-# Format: tuple of (major, minor) version numbers
-SUPPORTED_CUDA_VERSIONS = (
-    (12, 6),
-    (12, 8),
-    (13, 0),
-)
-
-# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
-# pip versions will have the required features.
-#
-# NOTE: If a newly-fetched version of the executorch repo changes the value of
-# NIGHTLY_VERSION, you should re-run this script to install the necessary
-# package versions.
-#
-# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
-# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-#
-# NOTE: If you're changing, make the corresponding supported CUDA versions in
-# SUPPORTED_CUDA_VERSIONS above if needed.
-
 
 def install_requirements(use_pytorch_nightly):
     # Skip pip install on Intel macOS if using nightly.
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 32ae865bfdf..f87e2c8d722 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -83,6 +83,7 @@ install(
   # it.
   TARGETS cpublas optimized_kernels optimized_ops_lib eigen_blas
   EXPORT ExecuTorchTargets
-  DESTINATION lib
-  PUBLIC_HEADER DESTINATION include/executorch/kernels/optimized/
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/kernels/optimized/
 )
diff --git a/kernels/optimized/External/EigenBLAS.cmake b/kernels/optimized/External/EigenBLAS.cmake
index 29d42478798..bc09786bed4 100644
--- a/kernels/optimized/External/EigenBLAS.cmake
+++ b/kernels/optimized/External/EigenBLAS.cmake
@@ -53,6 +53,6 @@ set_property(TARGET eigen_blas PROPERTY POSITION_INDEPENDENT_CODE ON)
 
 install(
   TARGETS eigen_blas
-  LIBRARY DESTINATION lib
-  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index eb8475b8d5a..a3ab1654ee5 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -91,13 +91,14 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   install(
     TARGETS optimized_portable_kernels optimized_portable_ops_lib
     EXPORT ExecuTorchTargets
-    DESTINATION lib
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
   )
 endif()
 
 install(
   TARGETS portable_kernels portable_ops_lib
   EXPORT ExecuTorchTargets
-  DESTINATION lib
-  PUBLIC_HEADER DESTINATION include/executorch/kernels/portable/
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/kernels/portable/
 )
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index b0c837cdefd..d4fc52af76b 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -152,6 +152,7 @@ gen_operators_lib(
 install(
   TARGETS quantized_kernels quantized_ops_lib
   EXPORT ExecuTorchTargets
-  DESTINATION lib
-  PUBLIC_HEADER DESTINATION include/executorch/kernels/quantized/
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  PUBLIC_HEADER
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/kernels/quantized/
 )
diff --git a/pyproject.toml b/pyproject.toml
index fbed875a824..393f8578c8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,10 @@
 [build-system]
 requires = [
   "cmake>=3.29,<4.0.0",  # For building binary targets in the wheel. 4.0.0 breaks third-party CMake build so temporarily pin the version.
+  "packaging>=24.2", # Lower bound required by setuptools
   "pip>=23",  # For building the pip package.
   "pyyaml",  # Imported by the kernel codegen tools.
-  "setuptools>=63",  # For building the pip package contents.
+  "setuptools>=77.0.3",  # For building the pip package contents.
   "wheel",  # For building the pip package archive.
   "zstd",  # Imported by resolve_buck.py.
   "certifi",  # Imported by resolve_buck.py.
@@ -21,7 +22,8 @@ readme = "README-wheel.md"
 authors = [
   {name="PyTorch Team", email="packages@pytorch.org"},
 ]
-license = {file = "LICENSE"}
+license = "BSD-3-Clause"
+license-files = ["LICENSE"]
 keywords = ["pytorch", "machine learning"]
 # PyPI package information.
 classifiers = [
@@ -29,11 +31,10 @@ classifiers = [
     #   3 - Alpha
     #   4 - Beta
     #   5 - Production/Stable
-    "Development Status :: 4 - Beta",
+    "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",
     "Intended Audience :: Education",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: BSD License",
     "Topic :: Scientific/Engineering",
     "Topic :: Scientific/Engineering :: Mathematics",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9df5e7b93ed..258a898894c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,9 +1,10 @@
 # Pip packages needed to build from source. Mainly for development of ExecuTorch.
 
 cmake>=3.29, <4.0.0  # For building binary targets in the wheel.
+packaging>=24.2 # Lower bound required by setuptools
 pip>=23  # For building the pip package.
 pyyaml  # Imported by the kernel codegen tools.
-setuptools>=63  # For building the pip package contents.
+setuptools>=77.0.3  # For building the pip package contents.
 wheel  # For building the pip package archive.
 zstd  # Imported by resolve_buck.py.
 certifi  # Imported by resolve_buck.py.
diff --git a/runtime/core/freeable_buffer.h b/runtime/core/freeable_buffer.h
index a90c899103d..c743f32116a 100644
--- a/runtime/core/freeable_buffer.h
+++ b/runtime/core/freeable_buffer.h
@@ -9,6 +9,12 @@
 #pragma once
 
 #include <cstddef>
+#include <cstdint>
+#include <variant>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/assert.h>
 
 namespace executorch {
 namespace runtime {
@@ -20,20 +26,35 @@ class FreeableBuffer final {
  public:
   // Callback signature for the function that does the freeing.
   using FreeFn = void (*)(void* context, void* data, size_t size);
+  using FreeUInt64Fn =
+      void (*)(void* context, uint64_t data_uint64, size_t size);
+
+ private:
+  // Forward declare types.
+  struct PointerData {
+    const void* data_;
+    FreeFn free_fn_;
+  };
 
+  struct UInt64Data {
+    // A pointer value cast to uint64_t.
+    uint64_t data_;
+    FreeUInt64Fn free_fn_;
+  };
+
+ public:
   /**
    * Creates an empty FreeableBuffer with size zero and a null data pointer.
    */
   FreeableBuffer()
-      : free_fn_(nullptr),
+      : data_(PointerData{nullptr, nullptr}),
         free_fn_context_(nullptr),
-        data_(nullptr),
         size_(0) {}
 
   /**
    * Creates a FreeableBuffer with an optional free function.
    *
-   * @param[in] data The data of the segment.
+   * @param[in] data The data of the segment, as a void*.
    * @param[in] size The size of the segment data, in bytes.
    * @param[in] free_fn Optional function to free the data. Guaranteed to be
    *     called exactly once before the FreeableBuffer is destroyed. May be
@@ -47,9 +68,35 @@ class FreeableBuffer final {
       size_t size,
       FreeFn free_fn,
       void* free_fn_context = nullptr)
-      : free_fn_(free_fn),
+      : data_(PointerData{data, free_fn}),
+        free_fn_context_(free_fn_context),
+        size_(size) {}
+
+  /**
+   * Creates a FreeableBuffer with an optional free function.
+   *
+   * NOTE: most users should use the other ctor with FreeFn.
+   * This variant exists for situations where the FreeableBuffer points to
+   * memory on a different core whose pointer value is larger than the local
+   * core's void*.
+   *
+   * @param[in] data Pointer to the data of the segment, cast to a uint64_t
+   * value.
+   * @param[in] size The size of the segment data, in bytes.
+   * @param[in] free_fn Optional function to free the data. Guaranteed to be
+   *     called exactly once before the FreeableBuffer is destroyed. May be
+   *     nullptr. NOTE: This function must be thread-safe. If it modifies common
+   *     state, the function must do its own locking.
+   * @param[in] free_fn_context Opaque pointer to pass as the `context`
+   *     parameter of `free_fn`. May be nullptr.
+   */
+  explicit FreeableBuffer(
+      const uint64_t data_uint64,
+      size_t size,
+      FreeUInt64Fn free_fn,
+      void* free_fn_context = nullptr)
+      : data_(UInt64Data{data_uint64, free_fn}),
         free_fn_context_(free_fn_context),
-        data_(data),
         size_(size) {}
 
   /**
@@ -57,13 +104,15 @@ class FreeableBuffer final {
    * leaving `rhs` pointing to nullptr.
    */
   FreeableBuffer(FreeableBuffer&& rhs) noexcept
-      : free_fn_(rhs.free_fn_),
+      : data_(rhs.data_),
         free_fn_context_(rhs.free_fn_context_),
-        data_(rhs.data_),
         size_(rhs.size_) {
-    rhs.free_fn_ = nullptr;
+    if (std::holds_alternative<PointerData>(rhs.data_)) {
+      rhs.data_ = PointerData{nullptr, nullptr};
+    } else {
+      rhs.data_ = UInt64Data{0, nullptr};
+    }
     rhs.free_fn_context_ = nullptr;
-    rhs.data_ = nullptr;
     rhs.size_ = 0;
   }
 
@@ -75,11 +124,22 @@ class FreeableBuffer final {
    * Frees the data if not already free. Safe to call multiple times.
    */
   void Free() {
-    if (data_ != nullptr) {
-      if (free_fn_ != nullptr) {
-        free_fn_(free_fn_context_, const_cast<void*>(data_), size_);
+    if (std::holds_alternative<PointerData>(data_)) {
+      PointerData& ptr_data = std::get<PointerData>(data_);
+      if (ptr_data.data_ != nullptr && ptr_data.free_fn_ != nullptr) {
+        // Do not need to check for truncation here, as free_fn_ is only set
+        // using the void* ctor.
+        ptr_data.free_fn_(
+            free_fn_context_, const_cast<void*>(ptr_data.data_), size_);
       }
-      data_ = nullptr;
+      ptr_data.data_ = nullptr;
+      size_ = 0;
+    } else {
+      UInt64Data& int64_data = std::get<UInt64Data>(data_);
+      if (int64_data.data_ != 0 && int64_data.free_fn_ != nullptr) {
+        int64_data.free_fn_(free_fn_context_, int64_data.data_, size_);
+      }
+      int64_data.data_ = static_cast<uint64_t>(0);
       size_ = 0;
     }
   }
@@ -95,7 +155,37 @@ class FreeableBuffer final {
    * Pointer to the data. Returns nullptr if the data has been freed.
    */
   const void* data() const {
-    return data_;
+    ET_CHECK_MSG(
+        std::holds_alternative<PointerData>(data_),
+        "FreeableBuffer is backed by an uint64_t, please use the data_uint64_type() API.");
+    return std::get<PointerData>(data_).data_;
+  }
+
+  /**
+   * Pointer to the data. Returns nullptr if the data has been freed.
+   * Safe version of data() API that returns an ERror if the data is
+   * backed by int64_t instead of void*.
+   */
+  Result<const void*> data_safe() const {
+    ET_CHECK_OR_RETURN_ERROR(
+        std::holds_alternative<PointerData>(data_),
+        InvalidType,
+        "FreeableBuffer is backed by an uint64_t, please use the data_uint64_type() API.");
+    return std::get<PointerData>(data_).data_;
+  }
+
+  /**
+   * Data address as a uint64_t. Returns zero if the data has been freed.
+   * Most users should use data(). data_uint64_type() is only helpful in
+   * situations where the FreeableBuffer points to memory on a different core
+   * whose pointer value is larger than the local core's void *.
+   */
+  Result<uint64_t> data_uint64_type() const {
+    ET_CHECK_OR_RETURN_ERROR(
+        std::holds_alternative<UInt64Data>(data_),
+        InvalidType,
+        "FreeableBuffer is backed by a void*, please use the data() API.");
+    return std::get<UInt64Data>(data_).data_;
   }
 
  private:
@@ -104,9 +194,15 @@ class FreeableBuffer final {
   FreeableBuffer& operator=(FreeableBuffer&& rhs) noexcept = delete;
   FreeableBuffer& operator=(const FreeableBuffer& rhs) = delete;
 
-  FreeFn free_fn_;
+  // This stores either a PointerData or a UInt64Data structure. Most users
+  // should use the PointerData variant and the void* ctor. This creates a
+  // FreeableBuffer backed by void*, accessed using the void* getter data().
+  // The UInt64Data variant is only helpful in situations where the
+  // FreeableBuffer points to memory on a different core whose pointer value
+  // is larger than the local core's void*.
+  std::variant<PointerData, UInt64Data> data_;
+
   void* free_fn_context_;
-  const void* data_;
   size_t size_;
 };
 
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index 558edb175ae..e340e7626a0 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -359,6 +359,7 @@ static inline int C10_WARP_SIZE_INTERNAL() {
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define CUDA_KERNEL_ASSERT_MSG(cond, msg)
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)
 #define SYCL_KERNEL_ASSERT(cond)
 #elif defined(_MSC_VER)
 #if defined(NDEBUG)
@@ -396,6 +397,26 @@ __host__ __device__
                static_cast<unsigned>(__LINE__)), \
            0);                                   \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                     \
+  if (C10_UNLIKELY(!(cond))) {                                        \
+    (void)(printf(                                                    \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(           \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: " \
+                      "Assertion failed: `" #cond "`: " msg "\n",     \
+        __func__,                                                     \
+        blockIdx.x,                                                   \
+        blockIdx.y,                                                   \
+        blockIdx.z,                                                   \
+        threadIdx.x,                                                  \
+        threadIdx.y,                                                  \
+        threadIdx.z,                                                  \
+        ##__VA_ARGS__));                                              \
+    (void)(_wassert(                                                  \
+               _CRT_WIDE(#cond),                                      \
+               _CRT_WIDE(__FILE__),                                   \
+               static_cast<unsigned>(__LINE__)),                      \
+           0);                                                        \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                 \
   if (C10_UNLIKELY(!(cond))) {                   \
     (void)(_wassert(                             \
@@ -455,6 +476,10 @@ __host__ __device__
   if C10_UNLIKELY (!(cond)) {             \
     abort();                              \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...) \
+  if C10_UNLIKELY (!(cond)) {                     \
+    abort();                                      \
+  }
 #define SYCL_KERNEL_ASSERT(cond) \
   if C10_UNLIKELY (!(cond)) {    \
     abort();                     \
@@ -470,6 +495,23 @@ __host__ __device__
     __assert_fail(                                                     \
         msg, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
   }
+#define CUDA_KERNEL_ASSERT_PRINTF(cond, msg, ...)                        \
+  if (C10_UNLIKELY(!(cond))) {                                           \
+    printf(                                                            \
+        "[CUDA_KERNEL_ASSERT] " __FILE__ ":" C10_STRINGIZE(            \
+            __LINE__) ": %s: block: [%d,%d,%d], thread: [%d,%d,%d]: "  \
+            "Assertion failed: `" #cond "`: " msg "\n",                \
+        __func__,                                                      \
+        blockIdx.x,                                                    \
+        blockIdx.y,                                                    \
+        blockIdx.z,                                                    \
+        threadIdx.x,                                                   \
+        threadIdx.y,                                                   \
+        threadIdx.z,                                                   \
+        ##__VA_ARGS__); \
+    __assert_fail(                                                       \
+        #cond, __FILE__, static_cast<unsigned int>(__LINE__), __func__); \
+  }
 #define SYCL_KERNEL_ASSERT(cond)                                         \
   if (C10_UNLIKELY(!(cond))) {                                           \
     __assert_fail(                                                       \
diff --git a/runtime/core/test/freeable_buffer_test.cpp b/runtime/core/test/freeable_buffer_test.cpp
index e2edff24227..2848a6b049d 100644
--- a/runtime/core/test/freeable_buffer_test.cpp
+++ b/runtime/core/test/freeable_buffer_test.cpp
@@ -6,16 +6,21 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/freeable_buffer.h>
+#include <executorch/runtime/platform/platform.h>
+#include <executorch/test/utils/DeathTest.h>
 
 #include <gtest/gtest.h>
 
 using namespace ::testing;
+
+using executorch::runtime::Error;
 using executorch::runtime::FreeableBuffer;
 
 struct FreeCallArgs {
   size_t calls;
-  void* data;
+  std::variant<const void*, uint64_t> data;
   size_t size;
 };
 
@@ -26,9 +31,18 @@ void RecordFree(void* context, void* data, size_t size) {
   call->size = size;
 }
 
+void RecordInt64Free(void* context, uint64_t data, size_t size) {
+  auto* call = reinterpret_cast<FreeCallArgs*>(context);
+  call->calls++;
+  call->data = data;
+  call->size = size;
+}
+
 TEST(FreeableBufferTest, EmptyTest) {
   FreeableBuffer fb;
   EXPECT_EQ(fb.data(), nullptr);
+  EXPECT_EQ(fb.data_safe().error(), Error::Ok);
+  EXPECT_EQ(fb.data_safe().get(), nullptr);
   EXPECT_EQ(fb.size(), 0);
 }
 
@@ -42,11 +56,33 @@ TEST(FreeableBufferTest, DataAndSizeTest) {
   // It should return the ctor params unmodified.
   EXPECT_EQ(fb.size(), sizeof(i));
   EXPECT_EQ(fb.data(), &i);
+  EXPECT_EQ(fb.data_safe().error(), Error::Ok);
+  EXPECT_EQ(fb.data_safe().get(), &i);
 
   // Freeing should clear them, even though free_fn is nullptr.
   fb.Free();
   EXPECT_EQ(fb.size(), 0);
   EXPECT_EQ(fb.data(), nullptr);
+  EXPECT_EQ(fb.data_safe().error(), Error::Ok);
+  EXPECT_EQ(fb.data_safe().get(), nullptr);
+
+  // Use uint64_t constructor.
+  const uint64_t i64 = 1;
+  FreeableBuffer fb2(
+      /*data_uint64=*/i64,
+      /*size=*/sizeof(i64),
+      /*free_fn=*/nullptr);
+
+  // It should return the ctor params unmodified.
+  EXPECT_EQ(fb2.size(), sizeof(i64));
+  EXPECT_EQ(fb2.data_uint64_type().error(), Error::Ok);
+  EXPECT_EQ(fb2.data_uint64_type().get(), i64);
+
+  // Freeing should clear them, even though free_fn is nullptr.
+  fb2.Free();
+  EXPECT_EQ(fb2.size(), 0);
+  EXPECT_EQ(fb2.data_uint64_type().error(), Error::Ok);
+  EXPECT_EQ(fb2.data_uint64_type().get(), 0);
 }
 
 TEST(FreeableBufferTest, FreeTest) {
@@ -68,7 +104,7 @@ TEST(FreeableBufferTest, FreeTest) {
     // Called once during Free() with the expected data/size.
     fb.Free();
     EXPECT_EQ(call.calls, 1);
-    EXPECT_EQ(call.data, &i);
+    EXPECT_EQ(std::get<const void*>(call.data), &i);
     EXPECT_EQ(call.size, sizeof(i));
 
     // A second call to Free() should not call the function again.
@@ -78,6 +114,31 @@ TEST(FreeableBufferTest, FreeTest) {
 
   // The destructor should not have called the function again.
   EXPECT_EQ(call.calls, 1);
+
+  // Test with uint64_t constructor and free function.
+  FreeCallArgs call2 = {};
+  {
+    uint64_t i64 = 1;
+    FreeableBuffer fb(
+        /*data_uint64=*/i64,
+        /*size=*/sizeof(i64),
+        /*free_fn=*/RecordInt64Free,
+        /*free_fn_context=*/&call2);
+
+    // Not called during construction.
+    EXPECT_EQ(call2.calls, 0);
+
+    // Called once during Free() with the expected data/size.
+    fb.Free();
+    EXPECT_EQ(call2.calls, 1);
+    EXPECT_EQ(std::get<uint64_t>(call2.data), i64);
+    EXPECT_EQ(call2.size, sizeof(i64));
+
+    // A second call to Free() should not call the function again.
+    fb.Free();
+    EXPECT_EQ(call2.calls, 1);
+  }
+  EXPECT_EQ(call2.calls, 1);
 }
 
 TEST(FreeableBufferTest, DestructorTest) {
@@ -99,8 +160,24 @@ TEST(FreeableBufferTest, DestructorTest) {
 
   // The destructor should have freed the data.
   EXPECT_EQ(call.calls, 1);
-  EXPECT_EQ(call.data, &i);
+  EXPECT_EQ(std::get<const void*>(call.data), &i);
   EXPECT_EQ(call.size, sizeof(i));
+
+  // Test with uint64_t constructor and free function.
+  FreeCallArgs call2 = {};
+  uint64_t i64 = 1;
+  {
+    FreeableBuffer fb2(
+        /*data_uint64=*/i64,
+        /*size=*/sizeof(i),
+        /*free_fn=*/RecordInt64Free,
+        /*free_fn_context=*/&call2);
+    EXPECT_EQ(call2.calls, 0);
+  }
+  // The destructor should have freed the data.
+  EXPECT_EQ(call2.calls, 1);
+  EXPECT_EQ(std::get<uint64_t>(call2.data), i64);
+  EXPECT_EQ(call2.size, sizeof(i));
 }
 
 TEST(FreeableBufferTest, MoveTest) {
@@ -127,7 +204,6 @@ TEST(FreeableBufferTest, MoveTest) {
   // The destination FreeableBuffer should have the data.
   EXPECT_EQ(fb_dst.size(), sizeof(i));
   EXPECT_EQ(fb_dst.data(), &i);
-
   // Freeing the source FreeableBuffer should not call the free function.
   fb_src.Free();
   EXPECT_EQ(call.calls, 0);
@@ -135,6 +211,59 @@ TEST(FreeableBufferTest, MoveTest) {
   // Freeing the destination FreeableBuffer should call the free function.
   fb_dst.Free();
   EXPECT_EQ(call.calls, 1);
-  EXPECT_EQ(call.data, &i);
   EXPECT_EQ(call.size, sizeof(i));
+
+  // Test with uint64_t constructor and free function.
+  FreeCallArgs call2 = {};
+  const uint64_t i64 = 1;
+  FreeableBuffer fb_src2(
+      /*data_uint64=*/i64,
+      /*size=*/sizeof(i64),
+      /*free_fn=*/RecordInt64Free,
+      /*free_fn_context=*/&call2);
+  EXPECT_EQ(fb_src2.size(), sizeof(i64));
+  EXPECT_EQ(fb_src2.data_uint64_type().error(), Error::Ok);
+  EXPECT_EQ(fb_src2.data_uint64_type().get(), i64);
+
+  // Move it into a second FreeableBuffer.
+  FreeableBuffer fb_dst2(std::move(fb_src2));
+
+  // The source FreeableBuffer should now be empty.
+  EXPECT_EQ(fb_src2.size(), 0); // NOLINT(bugprone-use-after-move)
+  EXPECT_EQ(
+      fb_src2.data_uint64_type().error(),
+      Error::Ok); // NOLINT(bugprone-use-after-move)
+  EXPECT_EQ(
+      fb_src2.data_uint64_type().get(), 0); // NOLINT(bugprone-use-after-move)
+
+  // The destination FreeableBuffer should have the data.
+  EXPECT_EQ(fb_dst2.size(), sizeof(i64));
+  EXPECT_EQ(fb_dst2.data_uint64_type().error(), Error::Ok);
+  EXPECT_EQ(fb_dst2.data_uint64_type().get(), i64);
+  // Freeing the source FreeableBuffer should not call the free function.
+  fb_src2.Free();
+  EXPECT_EQ(call2.calls, 0);
+
+  // Freeing the destination FreeableBuffer should call the free function.
+  fb_dst2.Free();
+  EXPECT_EQ(call2.calls, 1);
+  EXPECT_EQ(call2.size, sizeof(i64));
+}
+
+TEST(FreeableBufferTest, APIMisuseDeathTest) {
+  executorch::runtime::pal_init();
+  int i;
+  FreeableBuffer fb(
+      /*data=*/&i,
+      /*size=*/sizeof(i),
+      /*free_fn=*/nullptr);
+  EXPECT_EQ(fb.data_uint64_type().error(), Error::InvalidType);
+
+  uint64_t i64 = 1;
+  FreeableBuffer fb2(
+      /*data_uint64=*/i64,
+      /*size=*/sizeof(i64),
+      /*free_fn=*/nullptr);
+  EXPECT_EQ(fb2.data_safe().error(), Error::InvalidType);
+  ET_EXPECT_DEATH(fb2.data(), ".*");
 }
diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt
index 05d149ab1b4..5477831923c 100644
--- a/runtime/executor/test/CMakeLists.txt
+++ b/runtime/executor/test/CMakeLists.txt
@@ -191,4 +191,4 @@ target_link_libraries(test_backend_compiler_lib PUBLIC executorch_core)
 
 executorch_target_link_options_shared_lib(test_backend_compiler_lib)
 
-install(TARGETS test_backend_compiler_lib DESTINATION lib)
+install(TARGETS test_backend_compiler_lib DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/runtime/platform/log.h b/runtime/platform/log.h
index 72ea8528442..7293fa2428d 100644
--- a/runtime/platform/log.h
+++ b/runtime/platform/log.h
@@ -181,6 +181,20 @@ using ::executorch::runtime::LogLevel;
           ##__VA_ARGS__);                                            \
     }                                                                \
   } while (0)
+
+/**
+ * Check a condition and log an error message if the condition is false.
+ *
+ * @param[in] _condition The condition to check.
+ * @param[in] _format Log message format string.
+ */
+#define ET_CHECK_OR_LOG_ERROR(_condition, _format, ...) \
+  do {                                                  \
+    if (!(_condition)) {                                \
+      ET_LOG(Error, _format, ##__VA_ARGS__);            \
+    }                                                   \
+  } while (0)
+
 #else // ET_LOG_ENABLED
 
 /**
@@ -191,4 +205,12 @@ using ::executorch::runtime::LogLevel;
  */
 #define ET_LOG(_level, _format, ...) ((void)0)
 
+/**
+ * Check a condition and log an error message if the condition is false.
+ *
+ * @param[in] _condition The condition to check.
+ * @param[in] _format Log message format string.
+ */
+#define ET_CHECK_OR_LOG_ERROR(_condition, _format, ...) ((void)0)
+
 #endif // ET_LOG_ENABLED
diff --git a/scripts/build_apple_frameworks.sh b/scripts/build_apple_frameworks.sh
index 8ce2d68bab8..63fa4cf4545 100755
--- a/scripts/build_apple_frameworks.sh
+++ b/scripts/build_apple_frameworks.sh
@@ -31,6 +31,7 @@ libextension_apple.a,\
 libextension_data_loader.a,\
 libextension_flat_tensor.a,\
 libextension_module.a,\
+libextension_named_data_map.a,\
 libextension_tensor.a,\
 :${FRAMEWORK_EXECUTORCH_HEADERS_DIR}:${FRAMEWORK_EXECUTORCH_MODULE_NAME}"
 
diff --git a/scripts/build_wasm_tests.sh b/scripts/build_wasm_tests.sh
index 9a09ddd2749..4dd7355e118 100644
--- a/scripts/build_wasm_tests.sh
+++ b/scripts/build_wasm_tests.sh
@@ -22,6 +22,7 @@ emcmake cmake . -DEXECUTORCH_BUILD_WASM=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_SELECT_OPS_LIST="aten::mm.out,aten::add.out" \
     -DEXECUTORCH_BUILD_TESTS=ON \
diff --git a/scripts/pick_doc_commits.py b/scripts/pick_doc_commits.py
index 85958c36977..accec00dda3 100755
--- a/scripts/pick_doc_commits.py
+++ b/scripts/pick_doc_commits.py
@@ -129,7 +129,7 @@ def is_doc_file(path: str) -> bool:
     all_files = frozenset(lines[1:])
     doc_files = frozenset(filter(is_doc_file, all_files))
     non_doc_files = all_files - doc_files
-    is_doc_only = all_files == doc_files
+    is_doc_only = (all_files == doc_files) and len(all_files) > 0
 
     if verbosity > 0 and not is_doc_only:
         debug_log(
diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh
index 8cb86f8f43c..599ae1683a4 100755
--- a/scripts/test_ios.sh
+++ b/scripts/test_ios.sh
@@ -15,7 +15,7 @@ set -e
 
 OUTPUT="${1:-executorch}"
 EXIT_STATUS=0
-APP_PATH="executorch-examples/apple/ExecuTorchDemo/ExecuTorchDemo"
+APP_PATH="executorch-examples/mv3/apple/ExecuTorchDemo/ExecuTorchDemo"
 MODEL_NAME="mv3"
 SIMULATOR_NAME="executorch"
 
diff --git a/setup.py b/setup.py
index fe9543f3243..97a1d05096e 100644
--- a/setup.py
+++ b/setup.py
@@ -467,11 +467,10 @@ def run(self):
             # Following code is for building the Qualcomm backend.
             from backends.qualcomm.scripts.download_qnn_sdk import (
                 _download_qnn_sdk,
-                check_glibc_exist_and_validate,
                 is_linux_x86,
             )
 
-            if is_linux_x86() and check_glibc_exist_and_validate():
+            if is_linux_x86():
                 os.environ["EXECUTORCH_BUILDING_WHEEL"] = "1"
 
                 with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index ea086886449..8d8893f7454 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -341,6 +341,10 @@ EXTENSION_MODULE_SRCS = [
     "extension/module/module.cpp",
 ]
 
+EXTENSION_NAMED_DATA_MAP_SRCS = [
+    "extension/named_data_map/merged_data_map.cpp",
+]
+
 EXTENSION_RUNNER_UTIL_SRCS = [
     "extension/runner_util/inputs.cpp",
     "extension/runner_util/inputs_portable.cpp",
diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl
index 3546b64cdb6..0002884b2a4 100644
--- a/shim_et/xplat/executorch/codegen/codegen.bzl
+++ b/shim_et/xplat/executorch/codegen/codegen.bzl
@@ -1,12 +1,12 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_default_executorch_platforms", "is_xplat", "runtime", "struct_to_json")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
-load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
-load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
     "get_vec_deps",
     "get_vec_preprocessor_flags",
 )
+load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "optimized_source_list")
+load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "portable_source_list")
 load("@fbsource//xplat/executorch/kernels/prim_ops:selective_build.bzl", "prim_ops_registry_selective")
 
 # Headers that declare the function signatures of the C++ functions that
@@ -96,15 +96,17 @@ def _get_prim_ops_registry_target(name, deps, aten_suffix, platforms):
     Returns:
         String: Target name for the appropriate prim ops registry
     """
+
     # If selective build targets are specified, create a selective prim ops registry
     # Create a selective prim ops registry using the existing function
     selective_prim_ops_registry_name = name + "_selected_prim_ops_registry"
     combined_prim_ops_header_target_name = name + "_combined_prim_ops_header"
     selected_prim_operators_genrule(combined_prim_ops_header_target_name, deps, platforms)
+
     # Use the existing prim_ops_registry_selective function
     prim_ops_registry_selective(
         name = selective_prim_ops_registry_name,
-        selected_prim_ops_header_target = ":"+combined_prim_ops_header_target_name,
+        selected_prim_ops_header_target = ":" + combined_prim_ops_header_target_name,
         aten_suffix = aten_suffix,
         platforms = platforms,
     )
@@ -123,11 +125,16 @@ def _extract_prim_ops_from_lists(ops, ops_dict):
     Returns:
         Tuple of (prim_ops, remaining_ops, remaining_ops_dict)
     """
+
     def _is_aten_prim_op(op_name):
         if not op_name.startswith("aten::"):
             return False
         for prim_suffix in [
-            "sym_size", "sym_numel", "sym_max", "sym_min", "sym_float"
+            "sym_size",
+            "sym_numel",
+            "sym_max",
+            "sym_min",
+            "sym_float",
         ]:
             if prim_suffix in op_name:
                 return True
@@ -169,7 +176,6 @@ def et_operator_library(
         ops_schema_yaml_target = None,
         server_generated_yaml_target = None,
         **kwargs):
-
     # Check if we should extract prim ops from the operator lists
     # Note that selective build for prim ops doesnt support model or ops_schema_yaml_target or server_generated_yaml_target
     # TODO: Add support for selective build for prim ops with model or ops_schema_yaml_target or server_generated_yaml_target
@@ -178,6 +184,7 @@ def et_operator_library(
     if should_extract_prim_ops:
         # Extract prim ops from ops and ops_dict
         prim_ops, remaining_ops, remaining_ops_dict = _extract_prim_ops_from_lists(ops, ops_dict)
+
         # Use the remaining ops (with prim ops removed) for the main et_operator_library
         final_ops = remaining_ops
         final_ops_dict = remaining_ops_dict
@@ -189,6 +196,7 @@ def et_operator_library(
 
     selected_operator_yaml_filename = "selected_operators.yaml"
     selected_prim_ops_filename = "selected_prim_ops.h"
+
     # Generate the main operator library with the final ops
     # do a dummy copy if server_generated_yaml_target is set
     if server_generated_yaml_target:
@@ -231,6 +239,7 @@ def et_operator_library(
         "--prim_op_names=" + ",".join(prim_ops),
         "--output_dir=${OUT}",
     ]
+
     # Here we generate the selected_prim_ops.h and the selected_operators.yaml file
     # both with single genrule
     genrule_cmd = genrule_cmd + [" && "] + prim_ops_genrule_cmd
@@ -307,7 +316,6 @@ def _prepare_genrule_and_lib(
     if support_exceptions:
         genrule_cmd.append("--add-exception-boundary")
 
-
     # Sources for generated kernel registration lib
     sources = MANUAL_REGISTRATION_SOURCES if manual_registration else GENERATED_SOURCES
 
@@ -371,7 +379,8 @@ def _prepare_custom_ops_genrule_and_lib(
         custom_ops_yaml_path = None,
         support_exceptions = True,
         deps = [],
-        kernels = []):
+        kernels = [],
+        platforms = get_default_executorch_platforms()):
     """Similar to _prepare_genrule_and_lib but for custom ops."""
     genrules = {}
     libs = {}
@@ -390,6 +399,7 @@ def _prepare_custom_ops_genrule_and_lib(
                    "--output_dir $OUT ").format(deps = " ".join(["\"{}\"".format(d) for d in deps])),
             outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
             default_outs = ["."],
+            platforms = platforms,
         )
 
         # genrule for generating operator kernel bindings
@@ -460,6 +470,7 @@ def exir_custom_ops_aot_lib(
         kernels = kernels,
         support_exceptions = support_exceptions,
         deps = deps,
+        platforms = platforms,
     )
     for genrule in genrules:
         runtime.genrule(
@@ -468,6 +479,7 @@ def exir_custom_ops_aot_lib(
             cmd = genrules[genrule]["cmd"],
             outs = genrules[genrule]["outs"],
             default_outs = ["."],
+            platforms = platforms,
         )
     for compiler_lib in libs:
         runtime.cxx_library(
@@ -538,7 +550,7 @@ def get_optimized_lib_deps():
         "//executorch/runtime/kernel:kernel_includes",
     ] + get_vec_deps()
 
-def build_portable_header_lib(name, oplist_header_name, feature = None):
+def build_portable_header_lib(name, oplist_header_name, feature = None, **kwargs):
     """Build the portable headers into a header-only library.
     Ensures that includes work across portable and optimized libs.
     """
@@ -546,21 +558,23 @@ def build_portable_header_lib(name, oplist_header_name, feature = None):
         name = name,
         srcs = [],
         exported_headers = {
-            "selected_op_variants.h":":{}[selected_op_variants]".format(oplist_header_name),
+            "selected_op_variants.h": ":{}[selected_op_variants]".format(oplist_header_name),
         },
         exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
         header_namespace = "",
         feature = feature,
+        **kwargs
     )
 
 def build_portable_lib(
-    name,
-    et_operator_lib_deps = [],
-    oplist_header_name = None,
-    portable_header_lib = None,
-    feature = None,
-    expose_operator_symbols = False,
-    visibility = ["@EXECUTORCH_CLIENTS"]):
+        name,
+        et_operator_lib_deps = [],
+        oplist_header_name = None,
+        portable_header_lib = None,
+        feature = None,
+        expose_operator_symbols = False,
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        platforms = get_default_executorch_platforms()):
     """
     WARNING: Before using this, please consider using executorch_generated_lib instead. This
     function is only for special cases where you need to build a portable kernel library with
@@ -639,9 +653,10 @@ def build_portable_lib(
         # @lint-ignore BUCKLINT link_whole
         link_whole = True,
         feature = feature,
+        platforms = platforms,
     )
 
-def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False):
+def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature = None, expose_operator_symbols = False, platforms = get_default_executorch_platforms()):
     """Build optimized lib from source. We build from source so that the generated header file,
     selected_op_variants.h, can be used to selectively build the lib for different dtypes.
     """
@@ -661,7 +676,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
     # Currently fbcode links all dependent libraries through shared
     # library, and it blocks users like unit tests to use kernel
     # implementation directly. So we enable this for xplat only.
-    compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed","-Wno-global-constructors","-Wno-shadow",]
+    compiler_flags = ["-Wno-missing-prototypes", "-Wno-pass-failed", "-Wno-global-constructors", "-Wno-shadow"]
     if not expose_operator_symbols and is_xplat():
         # Removing '-fvisibility=hidden' exposes operator symbols.
         # This allows operators to be called outside of the kernel registry.
@@ -674,6 +689,7 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
         exported_preprocessor_flags = ["-DEXECUTORCH_SELECTIVE_BUILD_DTYPE"],
         deps = get_portable_lib_deps() + get_optimized_lib_deps() + [":" + portable_header_lib],
         compiler_flags = compiler_flags,
+        platforms = platforms,
         preprocessor_flags = get_vec_preprocessor_flags(),
         # sleef needs to be added as a direct dependency of the operator target when building for Android,
         # or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of
@@ -699,10 +715,9 @@ def build_optimized_lib(name, oplist_header_name, portable_header_lib, feature =
     )
 
 def selected_operators_genrule(
-    name,
-    deps,
-    platforms = get_default_executorch_platforms(),
-):
+        name,
+        deps,
+        platforms = get_default_executorch_platforms()):
     """Generates selected_operators.yaml from the list of deps. We look into the trasitive closure of all the deps,
     and look for macros `et_operator_library`.
 
@@ -725,10 +740,9 @@ def selected_operators_genrule(
     )
 
 def selected_prim_operators_genrule(
-    name,
-    deps,
-    platforms = get_default_executorch_platforms(),
-):
+        name,
+        deps,
+        platforms = get_default_executorch_platforms()):
     """Generates selected_prim_ops.h from the list of deps. We look into the transitive closure of all the deps,
     and look for targets with label `et_operator_library`.
 
@@ -750,12 +764,11 @@ def selected_prim_operators_genrule(
     )
 
 def dtype_header_genrule(
-    name,
-    visibility,
-    deps = [],
-    selected_operators_genrule_name = None,
-    platforms = get_default_executorch_platforms(),
-):
+        name,
+        visibility,
+        deps = [],
+        selected_operators_genrule_name = None,
+        platforms = get_default_executorch_platforms()):
     """Generate selected_op_variants.h from selected_operators.yaml.
 
     Given a `selected_operators.yaml` (passed in as selected_operators_genrule_name), we should be able to determine
@@ -921,15 +934,14 @@ def executorch_generated_lib(
                 index = index + 1
                 portable = name + "_check_portable_" + dep.split(":")[1] + str(index)
                 message = "Dtype selective build requires that the portable library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
-                check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message)
+                check_recursive_dependencies(portable, dep, "//executorch/kernels/portable:operators", message, platforms = platforms)
         if ("//executorch/kernels/optimized:optimized_operators" in kernel_deps):
             index = 0
             for dep in deps:
                 index = index + 1
                 optimized = name + "_check_optimized_" + dep.split(":")[1] + str(index)
                 message = "Dtype selective build requires that the optimized library is not passed into `deps`. This will cause duplicate symbol errors in the build. Please remove it from `deps` and place it into `kernel_deps`"
-                check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message)
-
+                check_recursive_dependencies(optimized, dep, "//executorch/kernels/optimized:optimized_operators", message, platforms = platforms)
 
     aten_suffix = "_aten" if aten_mode else ""
 
@@ -995,7 +1007,7 @@ def executorch_generated_lib(
     if dtype_selective_build:
         # Build portable headers lib. Used for portable and optimized kernel libraries.
         portable_header_lib = name + "_portable_header_lib"
-        build_portable_header_lib(portable_header_lib, oplist_header_name, feature)
+        build_portable_header_lib(portable_header_lib, oplist_header_name, feature, platforms = platforms)
 
         if "//executorch/kernels/portable:operators" in kernel_deps:
             # Remove portable from kernel_deps as we're building it from source.
@@ -1003,7 +1015,7 @@ def executorch_generated_lib(
 
             # Build portable lib.
             portable_lib_name = name + "_portable_lib"
-            build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols)
+            build_portable_lib(name = portable_lib_name, portable_header_lib = portable_header_lib, feature = feature, expose_operator_symbols = expose_operator_symbols, platforms = platforms)
             kernel_deps.append(":{}".format(portable_lib_name))
 
         if "//executorch/kernels/optimized:optimized_operators" in kernel_deps:
@@ -1012,7 +1024,7 @@ def executorch_generated_lib(
 
             # Build optimized lib.
             optimized_lib_name = name + "_optimized_lib"
-            build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols)
+            build_optimized_lib(optimized_lib_name, oplist_header_name, portable_header_lib, feature, expose_operator_symbols, platforms = platforms)
             kernel_deps.append(":{}".format(optimized_lib_name))
 
     # Exports headers that declare the function signatures of the C++ functions
@@ -1111,10 +1123,9 @@ def executorch_generated_lib(
 #
 # If build successfully, all of the `selected_operators.yaml` will be merged into 1 `selected_operators.yaml` for debugging purpose.
 def executorch_ops_check(
-    name,
-    deps,
-    **kwargs,
-):
+        name,
+        deps,
+        **kwargs):
     runtime.genrule(
         name = name,
         macros_only = False,
@@ -1128,16 +1139,15 @@ def executorch_ops_check(
         platforms = kwargs.pop("platforms", get_default_executorch_platforms()),
         outs = {"selected_operators.yaml": ["selected_operators.yaml"]},
         default_outs = ["."],
-        **kwargs,
+        **kwargs
     )
 
 def check_recursive_dependencies(
-    name,
-    parent,
-    child,
-    message = "",
-    **kwargs,
-):
+        name,
+        parent,
+        child,
+        message = "",
+        **kwargs):
     """
     Checks if child is a transitive dependency of parent and fails if it is.
     The query runs the equivalent of `buck2 uquery "allpaths(parent, child)".
diff --git a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
index a5c89147801..73421f031ec 100644
--- a/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
+++ b/shim_et/xplat/executorch/kernels/prim_ops/selective_build.bzl
@@ -28,6 +28,7 @@ def prim_ops_registry_selective(name, selected_prim_ops_header_target, aten_suff
             header_name: [header_name],
             "selected_prim_ops.h": ["selected_prim_ops.h"]
         },
+        platforms = kwargs.get("platforms", "CXX"),
         default_outs = ["."],
     )
     runtime.cxx_library(
diff --git a/src/executorch/examples/cuda b/src/executorch/examples/cuda
new file mode 120000
index 00000000000..aa2e50dd2cc
--- /dev/null
+++ b/src/executorch/examples/cuda
@@ -0,0 +1 @@
+../../../examples/cuda
\ No newline at end of file
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index 1648f2ba434..5166d454e60 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -41,6 +41,7 @@ build_executorch() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
diff --git a/third-party/ao b/third-party/ao
index b47f1a36550..01849b2b19c 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit b47f1a3655004b2b4dd3b4f01a5d8eebff1faa3c
+Subproject commit 01849b2b19cb923cb739a1fc02297ba418ddf715
diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake
index 2a6bf42b48a..32d3d8b554f 100644
--- a/tools/cmake/Codegen.cmake
+++ b/tools/cmake/Codegen.cmake
@@ -399,6 +399,7 @@ function(executorch_load_build_variables)
       EXTENSION_EVALUE_UTIL_SRCS
       EXTENSION_FLAT_TENSOR_SRCS
       EXTENSION_MODULE_SRCS
+      EXTENSION_NAMED_DATA_MAP_SRCS
       EXTENSION_RUNNER_UTIL_SRCS
       EXTENSION_LLM_RUNNER_SRCS
       EXTENSION_TENSOR_SRCS
@@ -431,6 +432,7 @@ function(executorch_load_build_variables)
       _extension_evalue_util__srcs
       _extension_flat_tensor__srcs
       _extension_module__srcs
+      _extension_named_data_map__srcs
       _extension_runner_util__srcs
       _extension_llm_runner__srcs
       _extension_tensor__srcs
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 6c27e8ba616..ba18aede63e 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -53,6 +53,7 @@ set(EXECUTORCH_FOUND ON)
 include("${CMAKE_CURRENT_LIST_DIR}/ExecuTorchTargets.cmake")
 
 set(optional_lib_list
+    aoti_cuda
     flatccrt
     etdump
     bundled_program
@@ -84,6 +85,7 @@ set(optional_lib_list
     quantized_kernels
     quantized_ops_lib
     quantized_ops_aot_lib
+    openvino_backend
     torchao_ops_executorch
     torchao_kernels_aarch64
 )
diff --git a/tools/cmake/preset/android.cmake b/tools/cmake/preset/android.cmake
index d794e8fcef3..5c9bc97e3ef 100644
--- a/tools/cmake/preset/android.cmake
+++ b/tools/cmake/preset/android.cmake
@@ -23,6 +23,7 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TRAINING ON)
diff --git a/tools/cmake/preset/apple_common.cmake b/tools/cmake/preset/apple_common.cmake
index 7b4ec420996..27ec35aa43e 100644
--- a/tools/cmake/preset/apple_common.cmake
+++ b/tools/cmake/preset/apple_common.cmake
@@ -28,6 +28,7 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
diff --git a/tools/cmake/preset/arm_baremetal.cmake b/tools/cmake/preset/arm_baremetal.cmake
index 33a12969484..882780ade1d 100644
--- a/tools/cmake/preset/arm_baremetal.cmake
+++ b/tools/cmake/preset/arm_baremetal.cmake
@@ -5,6 +5,8 @@
 
 set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}")
 set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR OFF)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER OFF)
 set_overridable_option(EXECUTORCH_BUILD_ARM_BAREMETAL ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
@@ -18,7 +20,6 @@ define_overridable_option(
 if("${EXECUTORCH_BUILD_ARM_ETDUMP}")
   set(EXECUTORCH_BUILD_DEVTOOLS ON)
   set(EXECUTORCH_ENABLE_EVENT_TRACER ON)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER OFF)
   set(FLATCC_ALLOW_WERROR OFF)
 else()
   set(EXECUTORCH_ENABLE_EVENT_TRACER OFF)
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index fb0dc0a4ade..04e84622589 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -67,11 +67,11 @@ define_overridable_option(
 )
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "Build the Data Loader extension" BOOL
-  OFF
+  ON # Required by executor_runner
 )
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension" BOOL
-  OFF
+  ON # Required by executor_runner
 )
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension" BOOL OFF
@@ -86,6 +86,10 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP
+  "Build the Named Data Map extension" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" BOOL OFF
 )
@@ -145,6 +149,9 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF
 )
@@ -176,6 +183,36 @@ define_overridable_option(
   ${_default_executorch_build_cpuinfo}
 )
 
+# Threadpool size options. At most one can be specified. Note that the default
+# is managed in threadpool.cpp to allow the user to specify an alternate mode
+# without needing to explicitly set the default to off.
+define_overridable_option(
+  EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES
+  "Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores."
+  BOOL
+  OFF
+)
+define_overridable_option(
+  EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
+  "Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores."
+  BOOL
+  OFF
+)
+
+check_required_options_on(
+  IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES
+  EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
+)
+check_required_options_on(
+  IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES
+  EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
+)
+
+check_conflicting_options_on(
+  IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH
+  EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
+)
+
 # TODO(jathu): move this to platform specific presets when created
 set(_default_executorch_build_executor_runner ON)
 if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")
@@ -277,6 +314,12 @@ check_required_options_on(
 check_required_options_on(
   IF_ON EXECUTORCH_BUILD_EXTENSION_MODULE REQUIRES
   EXECUTORCH_BUILD_EXTENSION_DATA_LOADER EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR
+  EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP
+)
+
+check_required_options_on(
+  IF_ON EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP REQUIRES
+  EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
 )
 
 check_required_options_on(
@@ -342,6 +385,10 @@ check_required_options_on(
   EXECUTORCH_BUILD_EXTENSION_LLM
 )
 
+check_required_options_on(
+  IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
+)
+
 if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH})
   message(
     FATAL_ERROR
diff --git a/tools/cmake/preset/llm.cmake b/tools/cmake/preset/llm.cmake
index e29fc7c4287..6cd2482f717 100644
--- a/tools/cmake/preset/llm.cmake
+++ b/tools/cmake/preset/llm.cmake
@@ -10,6 +10,7 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
diff --git a/tools/cmake/preset/profiling.cmake b/tools/cmake/preset/profiling.cmake
index a73c340078c..640a84b261c 100644
--- a/tools/cmake/preset/profiling.cmake
+++ b/tools/cmake/preset/profiling.cmake
@@ -9,6 +9,7 @@
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
diff --git a/tools/cmake/preset/pybind.cmake b/tools/cmake/preset/pybind.cmake
index f98e68ef5ac..c71c10ad01f 100644
--- a/tools/cmake/preset/pybind.cmake
+++ b/tools/cmake/preset/pybind.cmake
@@ -17,10 +17,11 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_LLM_AOT ON)
+set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
-set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 
 # TODO(larryliu0820): Temporarily disable building llm_runner for Windows wheel
 # due to the issue of tokenizer file path length limitation.
diff --git a/tools/cmake/preset/windows.cmake b/tools/cmake/preset/windows.cmake
index b75a5af578e..5123dfc956d 100644
--- a/tools/cmake/preset/windows.cmake
+++ b/tools/cmake/preset/windows.cmake
@@ -10,6 +10,7 @@ set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+set_overridable_option(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
diff --git a/torch_pin.py b/torch_pin.py
index 1b89309ad05..bb8d32d4716 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,19 @@
+# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
+# pip versions will have the required features.
+#
+# NOTE: If a newly-fetched version of the executorch repo changes the value of
+# NIGHTLY_VERSION, you should re-run install_executorch.sh script to install the necessary
+# package versions.
+#
+# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
+# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
+#
+# NOTE: If you're changing, make the corresponding supported CUDA versions in
+# SUPPORTED_CUDA_VERSIONS above if needed.
 TORCH_VERSION = "2.10.0"
-NIGHTLY_VERSION = "dev20250915"
+NIGHTLY_VERSION = "dev20251004"
+SUPPORTED_CUDA_VERSIONS = (
+    (12, 6),
+    (12, 8),
+    (13, 0),
+)