From 3f125b67e1f086439adf45cd3c49ae2eae14a7cb Mon Sep 17 00:00:00 2001
From: Jakob Degen <jakobdegen@meta.com>
Date: Wed, 8 May 2024 15:32:21 -0700
Subject: [PATCH 01/62] Update `sorted_vector_map`

Summary: The most recent fbsource version of `sorted_vector_map` was just released to crates.io, so builds fail until we bump the pinned version to 0.2.

Reviewed By: iguridi

Differential Revision: D57091891

fbshipit-source-id: b7a44252e42ad72f81378e50cd404818b454fcbf
---
 shim/third-party/rust/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index 1084abbddc8..b2de7475bae 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -22,7 +22,7 @@ path = "top/main.rs"
 
 gazebo = {version = "0.8.1", features = ["str_pattern_extensions"]}
 fbinit = "0.1"
-sorted_vector_map = "0.1"
+sorted_vector_map = "0.2"
 watchman_client = "0.8.0"
 
 annotate-snippets = { version = "0.9.0", features = ["color"] }

From 25214d4766c55ed8661ce9a97a38d0a616dc118c Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 8 May 2024 16:19:28 -0700
Subject: [PATCH 02/62] Use compile-time promotion to reduce max/min size &
 build time (#3459)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3459

Applies compile-time type promotion to another pair of ops, maximum and minimum, to reduce their binary size and build time.

Reviewed By: manuelcandales

Differential Revision: D56807402

fbshipit-source-id: 04a4a57df88cc1734243fd5c4ef20d1b7fc02a76
---
 kernels/portable/cpu/op_maximum.cpp | 68 ++++++++++++++++++++++------
 kernels/portable/cpu/op_minimum.cpp | 69 ++++++++++++++++++++++-------
 2 files changed, 108 insertions(+), 29 deletions(-)

diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp
index 3e34035d5f6..4091f2cf8ca 100644
--- a/kernels/portable/cpu/op_maximum.cpp
+++ b/kernels/portable/cpu/op_maximum.cpp
@@ -20,6 +20,50 @@ const T& max(const T& a, const T& b) {
   return (b > a) ? b : a;
 }
 
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = max(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MaximumInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
 } // namespace
 
 Tensor& maximum_out(
@@ -44,20 +88,16 @@ Tensor& maximum_out(
 
   ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() {
     ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() {
-      ET_SWITCH_REALB_TYPES(common_type, ctx, "maximum.out", CTYPE_IN, [&]() {
-        ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() {
-          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                CTYPE_IN value = max(a_casted, b_casted);
-
-                return static_cast<CTYPE_OUT>(value);
-              },
-              a,
-              b,
-              out);
-        });
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() {
+        MaximumInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, out);
       });
     });
   });
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
index 767a2c4ca59..7c106a63c4f 100644
--- a/kernels/portable/cpu/op_minimum.cpp
+++ b/kernels/portable/cpu/op_minimum.cpp
@@ -20,6 +20,50 @@ const T& min(const T& a, const T& b) {
   return (b < a) ? b : a;
 }
 
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = min(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MinimumInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
 } // namespace
 
 Tensor& minimum_out(
@@ -44,22 +88,17 @@ Tensor& minimum_out(
 
   ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "minimum.out", CTYPE_A, [&]() {
     ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "minimum.out", CTYPE_B, [&]() {
+      using CTYPE_IN =
+          typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
       ET_SWITCH_REAL_TYPES_AND(
-          Bool, common_type, ctx, "minimum.out", CTYPE_IN, [&]() {
-            ET_SWITCH_REAL_TYPES_AND(
-                Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
-                  apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                      [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                        CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                        CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                        CTYPE_IN value = min(a_casted, b_casted);
-
-                        return static_cast<CTYPE_OUT>(value);
-                      },
-                      a,
-                      b,
-                      out);
-                });
+          Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
+            MinimumInner<
+                can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                CTYPE_A,
+                CTYPE_B,
+                CTYPE_IN,
+                CTYPE_OUT>::run(a, b, out);
           });
     });
   });

From a2e13101b99720ce33a770aa7374f7b87d1ed279 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 8 May 2024 16:19:28 -0700
Subject: [PATCH 03/62] Use compile-time promotion to reduce floor_divide size
 & build time (#3455)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3455

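Continuing the rollout of compile-time type promotion, this time to floor_divide. Also adds an `is_integral_type` trait to scalar_type_util.h so the integral-type check can be evaluated at compile time.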

Reviewed By: manuelcandales

Differential Revision: D56827786

fbshipit-source-id: ede23e0377b9d70e0f378c3a5342c3bc1c9cd09b
---
 kernels/portable/cpu/op_floor_divide.cpp      | 93 +++++++++++++------
 .../core/exec_aten/util/scalar_type_util.h    |  6 ++
 2 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp
index 261f77ce617..0514df0ca25 100644
--- a/kernels/portable/cpu/op_floor_divide.cpp
+++ b/kernels/portable/cpu/op_floor_divide.cpp
@@ -20,6 +20,60 @@ namespace native {
 using Tensor = exec_aten::Tensor;
 using ScalarType = exec_aten::ScalarType;
 
+namespace {
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FloorDivideInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FloorDivideInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) {
+          if (is_integral_type<CTYPE_IN, /*includeBool=*/true>::value) {
+            if (val_b == 0) {
+              div_by_zero_error = true;
+              return static_cast<CTYPE_OUT>(0);
+            }
+          }
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = utils::floor_divide<CTYPE_IN>(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&, bool&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FloorDivideInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
+} // namespace
+
 Tensor& floor_divide_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -46,36 +100,17 @@ Tensor& floor_divide_out(
       Bool, a_type, ctx, "floor_divide.out", CTYPE_A, [&]() {
         ET_SWITCH_REAL_TYPES_AND(
             Bool, b_type, ctx, "floor_divide.out", CTYPE_B, [&]() {
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
               ET_SWITCH_REAL_TYPES(
-                  common_type, ctx, "floor_divide.out", CTYPE_IN, [&]() {
-                    ET_SWITCH_REAL_TYPES(
-                        out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [common_type, &div_by_zero_error](
-                                  const CTYPE_A val_a, const CTYPE_B val_b) {
-                                if (isIntegralType(
-                                        common_type, /*includeBool=*/true)) {
-                                  if (val_b == 0) {
-                                    div_by_zero_error = true;
-                                    return static_cast<CTYPE_OUT>(0);
-                                  }
-                                }
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value = utils::floor_divide<CTYPE_IN>(
-                                    a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                  out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() {
+                    FloorDivideInner<
+                        can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out, div_by_zero_error);
                   });
             });
       });
diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h
index 595ed7a1c02..084289520aa 100644
--- a/runtime/core/exec_aten/util/scalar_type_util.h
+++ b/runtime/core/exec_aten/util/scalar_type_util.h
@@ -349,6 +349,12 @@ inline constexpr bool isIntegralType(
        t == exec_aten::ScalarType::Short);
 }
 
+template <typename T, bool includeBool>
+struct is_integral_type
+    : public std::integral_constant<
+          bool,
+          isIntegralType(CppTypeToScalarType<T>::value, includeBool)> {};
+
 inline constexpr bool isFloatingType(exec_aten::ScalarType t) {
   return (
       t == exec_aten::ScalarType::Double || t == exec_aten::ScalarType::Float ||

From ad33982c604b9bb3344ac12a8e013689930da5f2 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 8 May 2024 16:19:28 -0700
Subject: [PATCH 04/62] Use compile-time promotion to reduce remainder size &
 build time (#3458)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3458

Yet another op that can benefit from compile-time type promotion.

Reviewed By: manuelcandales

Differential Revision: D56831293

fbshipit-source-id: ff79870512e3baaaaeb08a311cb5bf323ebfbe19
---
 kernels/portable/cpu/op_remainder.cpp | 81 ++++++++++++++++++---------
 kernels/test/op_remainder_test.cpp    | 14 +++++
 2 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp
index 9e48374a81a..7c858c1c08a 100644
--- a/kernels/portable/cpu/op_remainder.cpp
+++ b/kernels/portable/cpu/op_remainder.cpp
@@ -20,6 +20,52 @@ namespace native {
 
 using Tensor = exec_aten::Tensor;
 
+namespace {
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct RemainderInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct RemainderInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = utils::remainder_override(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct RemainderInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
+} // namespace
 Tensor& remainder_Tensor_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -45,32 +91,17 @@ Tensor& remainder_Tensor_out(
       Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() {
         ET_SWITCH_REAL_TYPES_AND(
             Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() {
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
               ET_SWITCH_REAL_TYPES(
-                  common_type, ctx, "remainder.Tensor_out", CTYPE_IN, [&]() {
-                    ET_SWITCH_REAL_TYPES(
-                        out_type,
-                        ctx,
-                        "remainder.Tensor_out",
-                        CTYPE_OUT,
-                        [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value = utils::remainder_override(
-                                    a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                  out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() {
+                    RemainderInner<
+                        can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out);
                   });
             });
       });
diff --git a/kernels/test/op_remainder_test.cpp b/kernels/test/op_remainder_test.cpp
index 4a550958a1a..254e8122b61 100644
--- a/kernels/test/op_remainder_test.cpp
+++ b/kernels/test/op_remainder_test.cpp
@@ -21,6 +21,7 @@ using exec_aten::Tensor;
 using torch::executor::testing::TensorFactory;
 
 class OpRemainderOutTest : public OperatorTest {
+ protected:
   Tensor& op_remainder_tensor_out(
       const Tensor& self,
       const Tensor& other,
@@ -35,3 +36,16 @@ class OpRemainderOutTest : public OperatorTest {
     return torch::executor::aten::remainder_outf(context_, self, other, out);
   }
 };
+
+TEST_F(OpRemainderOutTest, SmokeTest) {
+  // Long % Int promotes to Long; the Double `out` also covers the out cast.
+  TensorFactory<ScalarType::Double> tfDouble;
+  TensorFactory<ScalarType::Long> tfLong;
+  TensorFactory<ScalarType::Int> tfInt;
+  Tensor self = tfLong.full({2, 2}, 46);
+  Tensor other = tfInt.full({2, 2}, 4);
+  Tensor out = tfDouble.zeros({2, 2});
+  Tensor out_expected = tfDouble.full({2, 2}, 2.0);
+  op_remainder_tensor_out(self, other, out);
+  EXPECT_TENSOR_CLOSE(out, out_expected);
+}

From 5c05deffe10907c830665a73a732d0c08f954dbf Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 8 May 2024 16:19:28 -0700
Subject: [PATCH 05/62] Use compile-time promotion to reduce fmod size & build
 time (#3456)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3456

Almost done with Tensor ops that can benefit from compile-time promotion!

Reviewed By: manuelcandales

Differential Revision: D56835200

fbshipit-source-id: af3fb1723fd2488c44287ada01764d7bcddd6728
---
 kernels/portable/cpu/op_fmod.cpp | 93 ++++++++++++++++++++++----------
 kernels/test/op_fmod_test.cpp    | 13 +++++
 2 files changed, 78 insertions(+), 28 deletions(-)

diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp
index 0083c1379d5..42f83731199 100644
--- a/kernels/portable/cpu/op_fmod.cpp
+++ b/kernels/portable/cpu/op_fmod.cpp
@@ -19,6 +19,60 @@ namespace native {
 
 using Tensor = exec_aten::Tensor;
 
+namespace {
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FmodInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FmodInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) {
+          if (is_integral_type<CTYPE_IN, /*includeBool=*/true>::value) {
+            if (val_b == 0) {
+              div_by_zero_error = true;
+              return static_cast<CTYPE_OUT>(0);
+            }
+          }
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = std::fmod(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&, bool&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct FmodInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
+} // namespace
+
 Tensor& fmod_Tensor_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -44,35 +98,18 @@ Tensor& fmod_Tensor_out(
       Bool, a_type, ctx, "fmod.Tensor_out", CTYPE_A, [&]() {
         ET_SWITCH_REAL_TYPES_AND(
             Bool, b_type, ctx, "fmod.Tensor_out", CTYPE_B, [&]() {
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
               ET_SWITCH_REAL_TYPES(
-                  common_type, ctx, "fmod.Tensor_out", CTYPE_IN, [&]() {
-                    ET_SWITCH_REAL_TYPES(
-                        out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [common_type, &div_by_zero_error](
-                                  const CTYPE_A val_a, const CTYPE_B val_b) {
-                                if (isIntegralType(
-                                        common_type, /*includeBool=*/true)) {
-                                  if (val_b == 0) {
-                                    div_by_zero_error = true;
-                                    return static_cast<CTYPE_OUT>(0);
-                                  }
-                                }
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value = std::fmod(a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                  out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() {
+                    FmodInner<
+                        !std::is_same<CTYPE_IN, bool>::value &&
+                            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out, div_by_zero_error);
                   });
             });
       });
diff --git a/kernels/test/op_fmod_test.cpp b/kernels/test/op_fmod_test.cpp
index 475d4ea5cb4..4ee4d84c1cc 100644
--- a/kernels/test/op_fmod_test.cpp
+++ b/kernels/test/op_fmod_test.cpp
@@ -32,3 +32,16 @@ class OpFmodTest : public OperatorTest {
     return torch::executor::aten::fmod_outf(context_, self, other, out);
   }
 };
+
+TEST_F(OpFmodTest, SmokeTest) {
+  // fmod(Long, Int) promotes to Long; the Double `out` covers the out cast.
+  TensorFactory<ScalarType::Double> tfDouble;
+  TensorFactory<ScalarType::Long> tfLong;
+  TensorFactory<ScalarType::Int> tfInt;
+  Tensor self = tfLong.full({2, 2}, 46);
+  Tensor other = tfInt.full({2, 2}, 4);
+  Tensor out = tfDouble.zeros({2, 2});
+  Tensor out_expected = tfDouble.full({2, 2}, 2.0);
+  op_fmod_tensor_out(self, other, out);
+  EXPECT_TENSOR_CLOSE(out, out_expected);
+}

From 7841e96c7a37e82fd2d8aad037093f9d4d1068ee Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 8 May 2024 16:19:28 -0700
Subject: [PATCH 06/62] support Half in minimum and clamp (#3457)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3457

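If I understand correctly, minimum and clamp are expected to support Half but currently do not. I noticed the gap while comparing minimum against maximum, which already dispatches on Half.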

Reviewed By: manuelcandales

Differential Revision: D56846242

fbshipit-source-id: 6b5f85ee77ac6078ae2e82ad1f1944c5d5104340
---
 kernels/portable/cpu/op_clamp.cpp     | 18 ++++++------
 kernels/portable/cpu/op_minimum.cpp   | 27 +++++++++--------
 kernels/portable/cpu/util/math_util.h | 42 +++++++++++++++++++++++++++
 kernels/test/op_clamp_test.cpp        | 25 +++++++++++++---
 kernels/test/op_minimum_test.cpp      |  4 +++
 5 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp
index 06c87d03f2d..50d7e8c374d 100644
--- a/kernels/portable/cpu/op_clamp.cpp
+++ b/kernels/portable/cpu/op_clamp.cpp
@@ -53,7 +53,7 @@ __ET_NODISCARD bool check_bounds(
         }
       });
     } else if (isFloatingType(out_type)) {
-      ET_SWITCH_FLOAT_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() {
+      ET_SWITCH_FLOATH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() {
         if (std::isfinite(val) &&
             is_out_of_bounds<CTYPE_VAL, CTYPE_OUT, double>(val)) {
           ET_LOG(Error, "%s value out of bounds", val_name);
@@ -119,7 +119,7 @@ Tensor& clamp_out(
 
   ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
 
-  ET_SWITCH_REAL_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() {
+  ET_SWITCH_REALH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() {
     // Extract optional min value
     CTYPE_OUT min = 0;
     if (has_min) {
@@ -140,7 +140,7 @@ Tensor& clamp_out(
       });
     }
 
-    ET_SWITCH_REAL_TYPES_AND(Bool, in_type, ctx, "clamp", CTYPE_IN, [&]() {
+    ET_SWITCH_REALHB_TYPES(in_type, ctx, "clamp", CTYPE_IN, [&]() {
       apply_unary_map_fn(
           [has_min, min, has_max, max](const CTYPE_IN val_in) {
             CTYPE_OUT val_out = static_cast<CTYPE_OUT>(val_in);
@@ -195,20 +195,20 @@ Tensor& clamp_tensor_out(
   ScalarType out_type = out.scalar_type();
 
   if (has_min) {
-    common_type = promoteTypes(common_type, min_type);
+    common_type = promoteTypes(common_type, min_type, /*half_to_float*/ true);
   }
   if (has_max) {
-    common_type = promoteTypes(common_type, max_type);
+    common_type = promoteTypes(common_type, max_type, /*half_to_float*/ true);
   }
 
   ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
 
   constexpr auto name = "clamp.Tensor_out";
 
-  ET_SWITCH_REALB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() {
-    ET_SWITCH_REALB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() {
-      ET_SWITCH_REALB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() {
-        ET_SWITCH_REALB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+  ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() {
+    ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() {
+      ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() {
+        ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
           apply_ternary_elementwise_fn<
               CTYPE_IN,
               CTYPE_MIN,
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
index 7c106a63c4f..44c0efa8a67 100644
--- a/kernels/portable/cpu/op_minimum.cpp
+++ b/kernels/portable/cpu/op_minimum.cpp
@@ -81,25 +81,24 @@ Tensor& minimum_out(
 
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
-  ScalarType common_type = promoteTypes(a_type, b_type);
+  ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();
 
   ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "minimum.out", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "minimum.out", CTYPE_B, [&]() {
-      using CTYPE_IN =
-          typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
+  ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() {
+    ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
       ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      ET_SWITCH_REAL_TYPES_AND(
-          Bool, out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
-            MinimumInner<
-                can_cast<CTYPE_IN, CTYPE_OUT>::value,
-                CTYPE_A,
-                CTYPE_B,
-                CTYPE_IN,
-                CTYPE_OUT>::run(a, b, out);
-          });
+      ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() {
+        MinimumInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, out);
+      });
     });
   });
 
diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h
index 44cb47f8cba..df175147062 100644
--- a/kernels/portable/cpu/util/math_util.h
+++ b/kernels/portable/cpu/util/math_util.h
@@ -94,6 +94,48 @@ INT_T max_override(INT_T a, INT_T b) {
   return std::max(a, b);
 }
 
+template <
+    typename T,
+    typename std::enable_if<
+        std::is_same<T, torch::executor::Half>::value,
+        bool>::type = true>
+T min_override(T a, T b) {
+  const auto float_a = static_cast<float>(a);
+  if (std::isnan(float_a)) {
+    return a;
+  }
+  const auto float_b = static_cast<float>(b);
+  if (std::isnan(float_b)) {
+    return b;
+  }
+
+  if (float_a < float_b) {
+    return a;
+  }
+  return b;
+}
+
+template <
+    typename T,
+    typename std::enable_if<
+        std::is_same<T, torch::executor::Half>::value,
+        bool>::type = true>
+T max_override(T a, T b) {
+  const auto float_a = static_cast<float>(a);
+  if (std::isnan(float_a)) {
+    return a;
+  }
+  const auto float_b = static_cast<float>(b);
+  if (std::isnan(float_b)) {
+    return b;
+  }
+
+  if (float_a > float_b) {
+    return a;
+  }
+  return b;
+}
+
 /**
  * There is a slight difference in how std::fmod works compared to how ATen
  * determines remainders:
diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp
index 871333482c8..0244fd55700 100644
--- a/kernels/test/op_clamp_test.cpp
+++ b/kernels/test/op_clamp_test.cpp
@@ -147,8 +147,16 @@ class OpClampOutTest : public OperatorTest {
   // Test cases that are compatible with float and double.
   template <ScalarType DTYPE>
   void run_floating_point_test_cases() {
-    constexpr auto kInfinity =
-        std::numeric_limits<typename TensorFactory<DTYPE>::ctype>::infinity();
+    using ctype = typename TensorFactory<DTYPE>::ctype;
+    using opt_infinity_type = std::conditional_t<
+        std::is_same<ctype, exec_aten::Half>::value,
+        float,
+        ctype>;
+    constexpr auto kInfinity = std::numeric_limits<ctype>::infinity();
+    const auto kOptInfinity =
+        OptScalar(static_cast<opt_infinity_type>(kInfinity));
+    const auto kOptMinusInfinity =
+        OptScalar(static_cast<opt_infinity_type>(-kInfinity));
     std::vector<ClampTestCase<DTYPE>> test_cases = {
         {
             std::string(__func__) + ": Simple negative/positive clamp",
@@ -178,7 +186,7 @@ class OpClampOutTest : public OperatorTest {
             std::string(__func__) + ": Infinite min",
             {2, 2}, // sizes
             {-10.1, -1.1, 1.1, 10.1}, // input_data
-            OptScalar(-kInfinity), // min
+            kOptMinusInfinity, // min
             OptScalar(5.5), // max
             {-10.1, -1.1, 1.1, 5.5}, // expected_data
         },
@@ -187,7 +195,7 @@ class OpClampOutTest : public OperatorTest {
             {2, 2}, // sizes
             {-10.1, -1.1, 1.1, 10.1}, // input_data
             OptScalar(-5.5), // min
-            OptScalar(kInfinity), // max
+            kOptInfinity, // max
             {-5.5, -1.1, 1.1, 10.1}, // expected_data
         },
         {
@@ -285,6 +293,15 @@ TEST_F(OpClampOutTest, LongTensors) {
   run_signed_integer_test_cases<ScalarType::Long>();
 }
 
+TEST_F(OpClampOutTest, HalfTensors) {
+  // Note that the integer test cases test the situation where the min/max value
+  // Scalars are integer types, demonstrating that floating point types can be
+  // clamped to integer values.
+  run_unsigned_integer_test_cases<ScalarType::Half>();
+  run_signed_integer_test_cases<ScalarType::Half>();
+  run_floating_point_test_cases<ScalarType::Half>(); // inf bounds cast to float
+}
+
 TEST_F(OpClampOutTest, FloatTensors) {
   // Note that the integer test cases test the situation where the min/max value
   // Scalars are integer types, demonstrating that floating point types can be
diff --git a/kernels/test/op_minimum_test.cpp b/kernels/test/op_minimum_test.cpp
index be43e0af07d..7e12374b8d1 100644
--- a/kernels/test/op_minimum_test.cpp
+++ b/kernels/test/op_minimum_test.cpp
@@ -65,6 +65,10 @@ TEST_F(OpMinimumOutTest, LongTensors) {
   test_minimum_out_same_size<ScalarType::Long>();
 }
 
+TEST_F(OpMinimumOutTest, HalfTensors) {
+  test_minimum_out_same_size<ScalarType::Half>(); // same helper as Long/Float
+}
+
 TEST_F(OpMinimumOutTest, FloatTensors) {
   test_minimum_out_same_size<ScalarType::Float>();
 }

From c8f306d500fc5fd9a891e5d6a3ea3fbd9f5c6880 Mon Sep 17 00:00:00 2001
From: Jorge Pineda <jorgep31415@meta.com>
Date: Wed, 8 May 2024 16:31:56 -0700
Subject: [PATCH 07/62] Stylize struct members with snake_case (#3545)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3545

Rename struct members to snake_case, because the current mix of camelCase and snake_case is inconsistent.
ghstack-source-id: 225583077
exported-using-ghexport
bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: SS-JIA

Differential Revision: D57080712

fbshipit-source-id: c3a6a9a5c533c0f4ac5c28f0a8a0e4333d29f5ba
---
 backends/vulkan/runtime/VulkanBackend.cpp     | 14 ++---
 .../vulkan/runtime/VulkanDelegateHeader.cpp   | 18 +++----
 .../vulkan/runtime/VulkanDelegateHeader.h     |  8 +--
 backends/vulkan/runtime/api/Command.cpp       |  4 +-
 backends/vulkan/runtime/api/Command.h         |  4 +-
 backends/vulkan/runtime/api/Context.cpp       | 16 +++---
 backends/vulkan/runtime/api/Context.h         | 12 ++---
 backends/vulkan/runtime/api/Descriptor.cpp    | 14 ++---
 backends/vulkan/runtime/api/Descriptor.h      | 12 ++---
 backends/vulkan/runtime/api/QueryPool.cpp     | 10 ++--
 backends/vulkan/runtime/api/QueryPool.h       |  4 +-
 backends/vulkan/runtime/api/Runtime.cpp       | 24 ++++-----
 backends/vulkan/runtime/api/Runtime.h         |  8 +--
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 54 +++++++++----------
 backends/vulkan/runtime/graph/GraphConfig.cpp | 53 +++++++++---------
 backends/vulkan/runtime/graph/GraphConfig.h   | 16 +++---
 .../vulkan/test/op_tests/utils/codegen.py     |  4 +-
 17 files changed, 137 insertions(+), 138 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index c1f3f06b440..1c754599678 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -48,7 +48,7 @@ using BytesVector =
     const flatbuffers::Vector<flatbuffers::Offset<vkgraph::VkBytes>>*;
 using UIntVector = const flatbuffers::Vector<uint32_t>*;
 
-const uint8_t* getConstantDataPtr(
+const uint8_t* get_constant_data_ptr(
     VkGraphPtr flatbuffer_graph,
     const int32_t buffer_idx,
     const uint8_t* constant_data) {
@@ -111,19 +111,19 @@ GraphConfig get_graph_config(ArrayRef<CompileSpec>& compile_specs) {
     const size_t value_size = spec.value.nbytes;
     if (strcmp(spec.key, "storage_type_override") == 0) {
       ET_CHECK_MSG(value_size == sizeof(int32_t), "Unexpected value size!");
-      int value_as_int = static_cast<int>(GetUInt32LE(value_data));
+      int value_as_int = static_cast<int>(getUInt32LE(value_data));
       api::StorageType storage_type =
           static_cast<api::StorageType>(value_as_int);
 
-      config.setStorageTypeOverride(storage_type);
+      config.set_storage_type_override(storage_type);
     }
     if (strcmp(spec.key, "memory_layout_override") == 0) {
       ET_CHECK_MSG(value_size == sizeof(uint32_t), "Unexpected value size!");
-      uint32_t value_as_int = GetUInt32LE(value_data);
+      uint32_t value_as_int = getUInt32LE(value_data);
       api::GPUMemoryLayout memory_layout =
           static_cast<api::GPUMemoryLayout>(value_as_int);
 
-      config.setMemoryLayoutOverride(memory_layout);
+      config.set_memory_layout_override(memory_layout);
     }
   }
   return config;
@@ -181,7 +181,7 @@ class GraphBuilder {
 
     ValueRef ref;
     if (tensor_fb->constant_id() >= 0) {
-      const uint8_t* tensor_data = getConstantDataPtr(
+      const uint8_t* tensor_data = get_constant_data_ptr(
           flatbuffer_, tensor_fb->constant_id(), constant_data_);
 
       ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data);
@@ -399,7 +399,7 @@ class VulkanBackend final : public PyTorchBackendInterface {
   __ET_NODISCARD Error
   compileModel(const void* buffer_pointer, ComputeGraph* compute_graph) const {
     Result<VulkanDelegateHeader> header =
-        VulkanDelegateHeader::Parse(buffer_pointer);
+        VulkanDelegateHeader::parse(buffer_pointer);
 
     const uint8_t* flatbuffer_data = nullptr;
     const uint8_t* constant_data = nullptr;
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp
index 4415996a648..a9a9fa849a7 100644
--- a/backends/vulkan/runtime/VulkanDelegateHeader.cpp
+++ b/backends/vulkan/runtime/VulkanDelegateHeader.cpp
@@ -39,7 +39,7 @@ constexpr ByteSlice kBytesSize = {22, 8};
 } // namespace
 
 /// Interprets the 8 bytes at `data` as a little-endian uint64_t.
-uint64_t GetUInt64LE(const uint8_t* data) {
+uint64_t getUInt64LE(const uint8_t* data) {
   return (uint64_t)data[0] | ((uint64_t)data[1] << 8) |
       ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) |
       ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) |
@@ -47,13 +47,13 @@ uint64_t GetUInt64LE(const uint8_t* data) {
 }
 
 /// Interprets the 4 bytes at `data` as a little-endian uint32_t.
-uint32_t GetUInt32LE(const uint8_t* data) {
+uint32_t getUInt32LE(const uint8_t* data) {
   return (uint32_t)data[0] | ((uint32_t)data[1] << 8) |
       ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24);
 }
 
 /// Interprets the 2 bytes at `data` as a little-endian uint32_t.
-uint32_t GetUInt16LE(const uint8_t* data) {
+uint32_t getUInt16LE(const uint8_t* data) { // 16-bit value, returned widened
   return (uint32_t)data[0] | ((uint32_t)data[1] << 8);
 }
 
@@ -77,7 +77,7 @@ bool VulkanDelegateHeader::is_valid() const {
   return true;
 }
 
-Result<VulkanDelegateHeader> VulkanDelegateHeader::Parse(const void* data) {
+Result<VulkanDelegateHeader> VulkanDelegateHeader::parse(const void* data) {
   const uint8_t* header_data = (const uint8_t*)data;
 
   const uint8_t* magic_start = header_data + kMagic.offset;
@@ -86,11 +86,11 @@ Result<VulkanDelegateHeader> VulkanDelegateHeader::Parse(const void* data) {
   }
 
   VulkanDelegateHeader header = VulkanDelegateHeader{
-      GetUInt16LE(header_data + kHeaderSize.offset),
-      GetUInt32LE(header_data + kFlatbufferOffset.offset),
-      GetUInt32LE(header_data + kFlatbufferSize.offset),
-      GetUInt32LE(header_data + kBytesOffset.offset),
-      GetUInt64LE(header_data + kBytesSize.offset),
+      getUInt16LE(header_data + kHeaderSize.offset),
+      getUInt32LE(header_data + kFlatbufferOffset.offset),
+      getUInt32LE(header_data + kFlatbufferSize.offset),
+      getUInt32LE(header_data + kBytesOffset.offset),
+      getUInt64LE(header_data + kBytesSize.offset),
   };
 
   if (!header.is_valid()) {
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h
index f9757ef4c2a..c5e8859743a 100644
--- a/backends/vulkan/runtime/VulkanDelegateHeader.h
+++ b/backends/vulkan/runtime/VulkanDelegateHeader.h
@@ -15,14 +15,14 @@ namespace executor {
 namespace vulkan {
 
 // Byte decoding utilities
-uint64_t GetUInt64LE(const uint8_t* data);
-uint32_t GetUInt32LE(const uint8_t* data);
-uint32_t GetUInt16LE(const uint8_t* data);
+uint64_t getUInt64LE(const uint8_t* data);
+uint32_t getUInt32LE(const uint8_t* data);
+uint32_t getUInt16LE(const uint8_t* data);
 
 struct VulkanDelegateHeader {
   bool is_valid() const;
 
-  static Result<VulkanDelegateHeader> Parse(const void* data);
+  static Result<VulkanDelegateHeader> parse(const void* data);
 
   uint32_t header_size;
   uint32_t flatbuffer_offset;
diff --git a/backends/vulkan/runtime/api/Command.cpp b/backends/vulkan/runtime/api/Command.cpp
index 2ddb4ab15aa..841c40e471a 100644
--- a/backends/vulkan/runtime/api/Command.cpp
+++ b/backends/vulkan/runtime/api/Command.cpp
@@ -392,7 +392,7 @@ CommandPool::CommandPool(
   VK_CHECK(vkCreateCommandPool(device_, &create_info, nullptr, &pool_));
 
   // Pre-allocate some command buffers
-  allocate_new_batch(config_.cmdPoolInitialSize);
+  allocate_new_batch(config_.cmd_pool_initial_size);
 }
 
 CommandPool::~CommandPool() {
@@ -406,7 +406,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   std::lock_guard<std::mutex> lock(mutex_);
 
   // No-ops if there are command buffers available
-  allocate_new_batch(config_.cmdPoolBatchSize);
+  allocate_new_batch(config_.cmd_pool_batch_size);
 
   VkCommandBuffer handle = buffers_[in_use_];
 
diff --git a/backends/vulkan/runtime/api/Command.h b/backends/vulkan/runtime/api/Command.h
index 904631b2ac4..877f84d4cec 100644
--- a/backends/vulkan/runtime/api/Command.h
+++ b/backends/vulkan/runtime/api/Command.h
@@ -130,8 +130,8 @@ class CommandBuffer final {
 };
 
 struct CommandPoolConfig final {
-  uint32_t cmdPoolInitialSize;
-  uint32_t cmdPoolBatchSize;
+  uint32_t cmd_pool_initial_size; // buffers pre-allocated at pool creation
+  uint32_t cmd_pool_batch_size; // batch size requested by get_new_cmd()
 };
 
 class CommandPool final {
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 9a43cf455d6..9f6cdabf396 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -30,12 +30,12 @@ Context::Context(size_t adapter_i, const ContextConfig& config)
       device_(adapter_p_->device_handle()),
       queue_(adapter_p_->request_queue()),
       // Resource pools
-      command_pool_(device_, queue_.family_index, config_.cmdPoolConfig),
-      descriptor_pool_(device_, config_.descriptorPoolConfig),
+      command_pool_(device_, queue_.family_index, config_.cmd_pool_config),
+      descriptor_pool_(device_, config_.descriptor_pool_config),
       fences_(device_),
 // Diagnostics
 #ifdef USE_VULKAN_GPU_DIAGNOSTICS
-      querypool_(config_.queryPoolConfig, adapter_p_),
+      querypool_(config_.query_pool_config, adapter_p_),
 #endif /* USE_VULKAN_GPU_DIAGNOSTICS */
       // Command buffer submission
       cmd_mutex_{},
@@ -143,7 +143,7 @@ bool available() {
 Context* context() {
   static const std::unique_ptr<Context> context([]() -> Context* {
     try {
-      const uint32_t submit_frequency = 16u;
+      const uint32_t cmd_submit_frequency = 16u;
 
       const CommandPoolConfig cmd_config{
           32u, // cmdPoolInitialSize
@@ -165,10 +165,10 @@ Context* context() {
       };
 
       const ContextConfig config{
-          submit_frequency, // cmdSubmitFrequency
-          cmd_config, // cmdPoolConfig
-          descriptor_pool_config, // descriptorPoolConfig
-          query_pool_config, // queryPoolConfig
+          cmd_submit_frequency,
+          cmd_config,
+          descriptor_pool_config,
+          query_pool_config,
       };
 
       return new Context(runtime()->default_adapter_i(), config);
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index d79344dce8d..bddb9e1ae17 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -26,10 +26,10 @@ namespace vkcompute {
 namespace api {
 
 struct ContextConfig final {
-  uint32_t cmdSubmitFrequency;
-  CommandPoolConfig cmdPoolConfig;
-  DescriptorPoolConfig descriptorPoolConfig;
-  QueryPoolConfig queryPoolConfig;
+  uint32_t cmd_submit_frequency; // submit once submit_count_ reaches this
+  CommandPoolConfig cmd_pool_config; // passed to command_pool_
+  DescriptorPoolConfig descriptor_pool_config; // passed to descriptor_pool_
+  QueryPoolConfig query_pool_config; // querypool_ (USE_VULKAN_GPU_DIAGNOSTICS)
 };
 
 //
@@ -485,7 +485,7 @@ inline bool Context::submit_copy(
 
   submit_count_++;
   if (fence_handle != VK_NULL_HANDLE ||
-      submit_count_ >= config_.cmdSubmitFrequency) {
+      submit_count_ >= config_.cmd_submit_frequency) {
     submit_cmd_to_gpu(fence_handle);
     return true;
   }
@@ -568,7 +568,7 @@ inline bool Context::submit_compute_job(
 
   submit_count_++;
   if (fence_handle != VK_NULL_HANDLE ||
-      submit_count_ >= config_.cmdSubmitFrequency) {
+      submit_count_ >= config_.cmd_submit_frequency) {
     submit_cmd_to_gpu(fence_handle);
     return true;
   }
diff --git a/backends/vulkan/runtime/api/Descriptor.cpp b/backends/vulkan/runtime/api/Descriptor.cpp
index 572cc674981..99ca6978594 100644
--- a/backends/vulkan/runtime/api/Descriptor.cpp
+++ b/backends/vulkan/runtime/api/Descriptor.cpp
@@ -235,7 +235,7 @@ DescriptorPool::DescriptorPool(
       config_(config),
       mutex_{},
       piles_{} {
-  if (config.descriptorPoolMaxSets > 0) {
+  if (config.descriptor_pool_max_sets > 0) {
     init(config);
   }
 }
@@ -257,19 +257,19 @@ void DescriptorPool::init(const DescriptorPoolConfig& config) {
   std::vector<VkDescriptorPoolSize> type_sizes{
       {
           VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-          config_.descriptorUniformBufferCount,
+          config_.descriptor_uniform_buffer_count,
       },
       {
           VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-          config_.descriptorStorageBufferCount,
+          config_.descriptor_storage_buffer_count,
       },
       {
           VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-          config_.descriptorCombinedSamplerCount,
+          config_.descriptor_combined_sampler_count,
       },
       {
           VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-          config_.descriptorStorageBufferCount,
+          config_.descriptor_storage_image_count, // image tally, not buffers
       },
   };
 
@@ -277,7 +277,7 @@ void DescriptorPool::init(const DescriptorPoolConfig& config) {
       VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // sType
       nullptr, // pNext
       0u, // flags
-      config_.descriptorPoolMaxSets, // maxSets
+      config_.descriptor_pool_max_sets, // maxSets
       static_cast<uint32_t>(type_sizes.size()), // poolSizeCounts
       type_sizes.data(), // pPoolSizes
   };
@@ -297,7 +297,7 @@ DescriptorSet DescriptorPool::get_descriptor_set(
              .insert({
                  set_layout,
                  DescriptorSetPile(
-                     config_.descriptorPileSizes, set_layout, device_, pool_),
+                     config_.descriptor_pile_sizes, set_layout, device_, pool_),
              })
              .first;
   }
diff --git a/backends/vulkan/runtime/api/Descriptor.h b/backends/vulkan/runtime/api/Descriptor.h
index 0b6b1cd885a..915a5b824c1 100644
--- a/backends/vulkan/runtime/api/Descriptor.h
+++ b/backends/vulkan/runtime/api/Descriptor.h
@@ -107,14 +107,14 @@ class DescriptorSetPile final {
 
 struct DescriptorPoolConfig final {
   // Overall Pool capacity
-  uint32_t descriptorPoolMaxSets;
+  uint32_t descriptor_pool_max_sets; // 0 skips init() in the constructor
   // DescriptorCounts by type
-  uint32_t descriptorUniformBufferCount;
-  uint32_t descriptorStorageBufferCount;
-  uint32_t descriptorCombinedSamplerCount;
-  uint32_t descriptorStorageImageCount;
+  uint32_t descriptor_uniform_buffer_count; // UNIFORM_BUFFER
+  uint32_t descriptor_storage_buffer_count; // STORAGE_BUFFER
+  uint32_t descriptor_combined_sampler_count; // COMBINED_IMAGE_SAMPLER
+  uint32_t descriptor_storage_image_count; // STORAGE_IMAGE
   // Pile size for pre-allocating descriptor sets
-  uint32_t descriptorPileSizes;
+  uint32_t descriptor_pile_sizes; // handed to each DescriptorSetPile
 };
 
 class DescriptorPool final {
diff --git a/backends/vulkan/runtime/api/QueryPool.cpp b/backends/vulkan/runtime/api/QueryPool.cpp
index b908c6e53b4..5deff1d4c4c 100644
--- a/backends/vulkan/runtime/api/QueryPool.cpp
+++ b/backends/vulkan/runtime/api/QueryPool.cpp
@@ -42,13 +42,13 @@ QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
       nullptr, // pNext
       0u, // flags
       VK_QUERY_TYPE_TIMESTAMP, // queryType
-      config_.maxQueryCount, // queryCount
+      config_.max_query_count, // queryCount
       0u, // pipelineStatistics
   };
 
   VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
 
-  shader_log().reserve(config_.initialReserveSize);
+  shader_log().reserve(config_.initial_reserve_size);
 
   VK_CHECK_COND(adapter_p, "Valid GPU device must be created for QueryPool");
   ns_per_tick_ = std::lround(adapter_p->timestamp_period());
@@ -79,16 +79,16 @@ void QueryPool::reset(const CommandBuffer& cmd) {
   previous_shader_count_ += shader_log().size();
   in_use_ = 0u;
   shader_logs_.emplace_back();
-  shader_log().reserve(config_.initialReserveSize);
+  shader_log().reserve(config_.initial_reserve_size);
   results_pending_ = false;
 }
 
 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
   VK_CHECK_COND(
-      in_use_ < config_.maxQueryCount,
+      in_use_ < config_.max_query_count,
       "Vulkan QueryPool: Exceeded the maximum number of queries "
       "allowed by the queryPool (",
-      config_.maxQueryCount,
+      config_.max_query_count,
       ")!");
 
   cmd.write_timestamp(querypool_, in_use_);
diff --git a/backends/vulkan/runtime/api/QueryPool.h b/backends/vulkan/runtime/api/QueryPool.h
index 9249942df08..a0c6d9b14f1 100644
--- a/backends/vulkan/runtime/api/QueryPool.h
+++ b/backends/vulkan/runtime/api/QueryPool.h
@@ -22,8 +22,8 @@ namespace vkcompute {
 namespace api {
 
 struct QueryPoolConfig final {
-  uint32_t maxQueryCount;
-  uint32_t initialReserveSize;
+  uint32_t max_query_count; // queryCount of the timestamp query pool
+  uint32_t initial_reserve_size; // capacity reserved for each shader log
 };
 
 struct ShaderDuration final {
diff --git a/backends/vulkan/runtime/api/Runtime.cpp b/backends/vulkan/runtime/api/Runtime.cpp
index e113a4e3b4f..ebed34162f3 100644
--- a/backends/vulkan/runtime/api/Runtime.cpp
+++ b/backends/vulkan/runtime/api/Runtime.cpp
@@ -91,7 +91,7 @@ VkInstance create_instance(const RuntimeConfiguration& config) {
   std::vector<const char*> enabled_layers;
   std::vector<const char*> enabled_extensions;
 
-  if (config.enableValidationMessages) {
+  if (config.enable_validation_messages) {
     std::vector<const char*> requested_layers{
         // "VK_LAYER_LUNARG_api_dump",
         "VK_LAYER_KHRONOS_validation",
@@ -175,7 +175,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
 VkDebugReportCallbackEXT create_debug_report_callback(
     VkInstance instance,
     const RuntimeConfiguration config) {
-  if (VK_NULL_HANDLE == instance || !config.enableValidationMessages) {
+  if (VK_NULL_HANDLE == instance || !config.enable_validation_messages) {
     return VkDebugReportCallbackEXT{};
   }
 
@@ -245,20 +245,20 @@ std::unique_ptr<Runtime> init_global_vulkan_runtime() {
   }
 #endif /* USE_VULKAN_VOLK, USE_VULKAN_WRAPPER */
 
-  const bool enableValidationMessages =
+  const bool enable_validation_messages =
 #if defined(VULKAN_DEBUG)
       true;
 #else
       false;
 #endif /* VULKAN_DEBUG */
-  const bool initDefaultDevice = true;
-  const uint32_t numRequestedQueues = 1; // TODO: raise this value
+  const bool init_default_device = true;
+  const uint32_t num_requested_queues = 1; // TODO: raise this value
 
   const RuntimeConfiguration default_config{
-      enableValidationMessages,
-      initDefaultDevice,
+      enable_validation_messages,
+      init_default_device,
       AdapterSelector::First,
-      numRequestedQueues,
+      num_requested_queues,
   };
 
   try {
@@ -281,9 +281,9 @@ Runtime::Runtime(const RuntimeConfiguration config)
   // List of adapters will never exceed the number of physical devices
   adapters_.reserve(device_mappings_.size());
 
-  if (config.initDefaultDevice) {
+  if (config.init_default_device) {
     try {
-      switch (config.defaultSelector) {
+      switch (config.default_selector) {
         case AdapterSelector::First:
           default_adapter_i_ = create_adapter(select_first);
       }
@@ -350,8 +350,8 @@ uint32_t Runtime::create_adapter(const Selector& selector) {
   }
   // Otherwise, create an adapter for the selected physical device
   adapter_i = utils::safe_downcast<int32_t>(adapters_.size());
-  adapters_.emplace_back(
-      new Adapter(instance_, device_mapping.first, config_.numRequestedQueues));
+  adapters_.emplace_back(new Adapter(
+      instance_, device_mapping.first, config_.num_requested_queues));
   device_mapping.second = adapter_i;
 
   return adapter_i;
diff --git a/backends/vulkan/runtime/api/Runtime.h b/backends/vulkan/runtime/api/Runtime.h
index f54bd7522ac..6cfcc0ca03a 100644
--- a/backends/vulkan/runtime/api/Runtime.h
+++ b/backends/vulkan/runtime/api/Runtime.h
@@ -35,10 +35,10 @@ enum AdapterSelector {
 };
 
 struct RuntimeConfiguration final {
-  bool enableValidationMessages;
-  bool initDefaultDevice;
-  AdapterSelector defaultSelector;
-  uint32_t numRequestedQueues;
+  bool enable_validation_messages; // gates validation layers + debug callback
+  bool init_default_device; // create the default adapter in Runtime's ctor
+  AdapterSelector default_selector; // how that default adapter is picked
+  uint32_t num_requested_queues; // forwarded to each new Adapter
 };
 
 class Runtime final {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 0c7941d6f52..2e8f4c007d0 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -56,7 +56,7 @@ ComputeGraph::ComputeGraph(GraphConfig config)
       execute_descriptor_counts_{},
       context_{new api::Context(
           api::runtime()->default_adapter_i(),
-          config_.contextConfig)},
+          config_.context_config)},
       shared_objects_{},
       values_{},
       param_ubos_{},
@@ -65,17 +65,17 @@ ComputeGraph::ComputeGraph(GraphConfig config)
       inputs_{},
       outputs_{} {
   // Ensure that descriptor counts are initialized to 0
-  prepack_descriptor_counts_.descriptorPoolMaxSets = 0;
-  prepack_descriptor_counts_.descriptorUniformBufferCount = 0;
-  prepack_descriptor_counts_.descriptorStorageBufferCount = 0;
-  prepack_descriptor_counts_.descriptorCombinedSamplerCount = 0;
-  prepack_descriptor_counts_.descriptorStorageImageCount = 0;
-
-  execute_descriptor_counts_.descriptorPoolMaxSets = 0;
-  execute_descriptor_counts_.descriptorUniformBufferCount = 0;
-  execute_descriptor_counts_.descriptorStorageBufferCount = 0;
-  execute_descriptor_counts_.descriptorCombinedSamplerCount = 0;
-  execute_descriptor_counts_.descriptorStorageImageCount = 0;
+  prepack_descriptor_counts_.descriptor_pool_max_sets = 0;
+  prepack_descriptor_counts_.descriptor_uniform_buffer_count = 0;
+  prepack_descriptor_counts_.descriptor_storage_buffer_count = 0;
+  prepack_descriptor_counts_.descriptor_combined_sampler_count = 0;
+  prepack_descriptor_counts_.descriptor_storage_image_count = 0;
+
+  execute_descriptor_counts_.descriptor_pool_max_sets = 0;
+  execute_descriptor_counts_.descriptor_uniform_buffer_count = 0;
+  execute_descriptor_counts_.descriptor_storage_buffer_count = 0;
+  execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
+  execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
   context_->set_cmd(/*reusable = */ true);
 }
@@ -95,20 +95,20 @@ void ComputeGraph::update_descriptor_counts(
   api::DescriptorPoolConfig* config =
       execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_;
 
-  config->descriptorPoolMaxSets += 1;
+  config->descriptor_pool_max_sets += 1;
   for (const VkDescriptorType arg_type : shader_info.kernel_layout) {
     switch (arg_type) {
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-        config->descriptorUniformBufferCount += 1;
+        config->descriptor_uniform_buffer_count += 1;
         break;
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-        config->descriptorStorageBufferCount += 1;
+        config->descriptor_storage_buffer_count += 1;
         break;
       case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-        config->descriptorCombinedSamplerCount += 1;
+        config->descriptor_combined_sampler_count += 1;
         break;
       case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-        config->descriptorStorageImageCount += 1;
+        config->descriptor_storage_image_count += 1;
         break;
       default:
         VK_THROW("Unsupported descriptor type!");
@@ -117,16 +117,16 @@ void ComputeGraph::update_descriptor_counts(
 }
 
 api::StorageType ComputeGraph::suggested_storage_type() {
-  if (config_.enableStorageTypeOverride) {
-    return config_.storageTypeOverride;
+  if (config_.enable_storage_type_override) {
+    return config_.storage_type_override; // configured override wins
   }
   return api::kTexture3D;
 }
 
 api::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
     const std::vector<int64_t>& sizes) {
-  if (config_.enableMemoryLayoutOverride) {
-    return config_.memoryLayoutOverride;
+  if (config_.enable_memory_layout_override) {
+    return config_.memory_layout_override;
   }
   if (sizes.size() < 3) {
     return api::kWidthPacked;
@@ -319,15 +319,15 @@ void ComputeGraph::prepare() {
       std::max(                               \
           execute_descriptor_counts_.field,   \
           prepack_descriptor_counts_.field) * \
-      config_.descriptorPoolSafetyFactor))
+      config_.descriptor_pool_safety_factor))
 
-  uint32_t max_sets = MERGE_FIELD(descriptorPoolMaxSets);
+  uint32_t max_sets = MERGE_FIELD(descriptor_pool_max_sets);
   api::DescriptorPoolConfig config{
       max_sets,
-      std::max(MERGE_FIELD(descriptorUniformBufferCount), max_sets),
-      std::max(MERGE_FIELD(descriptorStorageBufferCount), max_sets),
-      std::max(MERGE_FIELD(descriptorCombinedSamplerCount), max_sets),
-      std::max(MERGE_FIELD(descriptorStorageImageCount), max_sets),
+      std::max(MERGE_FIELD(descriptor_uniform_buffer_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_storage_buffer_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_combined_sampler_count), max_sets),
+      std::max(MERGE_FIELD(descriptor_storage_image_count), max_sets),
       1u,
   };
 
diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp
index 98b2d9a4263..29de4704395 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.cpp
+++ b/backends/vulkan/runtime/graph/GraphConfig.cpp
@@ -12,12 +12,12 @@ namespace vkcompute {
 
 GraphConfig::GraphConfig() {
   // No automatic submissions
-  const uint32_t submit_frequency = UINT32_MAX;
+  const uint32_t cmd_submit_frequency = UINT32_MAX;
 
   // Only one command buffer will be encoded at a time
   const api::CommandPoolConfig cmd_config{
-      1u, // cmdPoolInitialSize
-      1u, // cmdPoolBatchSize
+      1u, // cmd_pool_initial_size
+      1u, // cmd_pool_batch_size
   };
 
   // Use lazy descriptor pool initialization by default; the graph runtime will
@@ -25,49 +25,48 @@ GraphConfig::GraphConfig() {
   // trigger descriptor pool initialization with exact sizes before encoding the
   // command buffer.
   const api::DescriptorPoolConfig descriptor_pool_config{
-      0u, // descriptorPoolMaxSets
-      0u, // descriptorUniformBufferCount
-      0u, // descriptorStorageBufferCount
-      0u, // descriptorCombinedSamplerCount
-      0u, // descriptorStorageImageCount
-      0u, // descriptorPileSizes
+      0u, // descriptor_pool_max_sets
+      0u, // descriptor_uniform_buffer_count
+      0u, // descriptor_storage_buffer_count
+      0u, // descriptor_combined_sampler_count
+      0u, // descriptor_storage_image_count
+      0u, // descriptor_pile_sizes
   };
 
   const api::QueryPoolConfig query_pool_config{};
 
-  const api::ContextConfig context_config{
-      submit_frequency, // cmdSubmitFrequency
-      cmd_config, // cmdPoolConfig
-      descriptor_pool_config, // descriptorPoolConfig
-      query_pool_config, // queryPoolConfig
+  context_config = {
+      cmd_submit_frequency,
+      cmd_config,
+      descriptor_pool_config,
+      query_pool_config,
   };
 
-  contextConfig = context_config;
-
   // Empirically selected safety factor. If descriptor pools start running out
   // of memory, increase this safety factor.
-  descriptorPoolSafetyFactor = 1.25;
+  descriptor_pool_safety_factor = 1.25;
 
   // For now, force kTexture3D storage as we are still developing shader support
   // for buffer storage type.
-  enableStorageTypeOverride = true;
-  storageTypeOverride = api::kTexture3D;
+  enable_storage_type_override = true;
+  storage_type_override = api::kTexture3D;
 
   // For now, force kWidthPacked memory layout by default as we are still
   // developing support for other memory layouts. In the future memory layout
   // settings will be serialized as part of the graph.
-  enableMemoryLayoutOverride = true;
-  memoryLayoutOverride = api::kWidthPacked;
+  enable_memory_layout_override = true;
+  memory_layout_override = api::kWidthPacked;
 }
 
-void GraphConfig::setStorageTypeOverride(api::StorageType storage_type) {
-  enableStorageTypeOverride = true;
-  storageTypeOverride = storage_type;
+void GraphConfig::set_storage_type_override(api::StorageType storage_type) {
+  enable_storage_type_override = true; // switch the override on
+  storage_type_override = storage_type; // value the graph will suggest
 }
 
-void GraphConfig::setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout) {
-  enableMemoryLayoutOverride = true;
-  memoryLayoutOverride = memory_layout;
+void GraphConfig::set_memory_layout_override(
+    api::GPUMemoryLayout memory_layout) {
+  enable_memory_layout_override = true; // switch the override on
+  memory_layout_override = memory_layout; // value the graph will suggest
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 7fb99f50407..f3e311daa22 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -13,26 +13,26 @@
 namespace vkcompute {
 
 struct GraphConfig final {
-  api::ContextConfig contextConfig;
+  api::ContextConfig context_config; // used to build the graph's api::Context
 
   // Creating a descriptor pool with exactly the number of descriptors tallied
   // by iterating through the shader layouts of shaders used in the graph risks
   // the descriptor pool running out of memory, therefore apply a safety factor
   // to descriptor counts when creating the descriptor pool to mitigate this
   // risk.
-  float descriptorPoolSafetyFactor;
+  float descriptor_pool_safety_factor; // multiplies the tallied counts
 
-  bool enableStorageTypeOverride;
-  api::StorageType storageTypeOverride;
+  bool enable_storage_type_override; // if set, the type below is suggested
+  api::StorageType storage_type_override;
 
-  bool enableMemoryLayoutOverride;
-  api::GPUMemoryLayout memoryLayoutOverride;
+  bool enable_memory_layout_override; // if set, the layout below is suggested
+  api::GPUMemoryLayout memory_layout_override;
 
   // Generate a default graph config with pre-configured settings
   explicit GraphConfig();
 
-  void setStorageTypeOverride(api::StorageType storage_type);
-  void setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout);
+  void set_storage_type_override(api::StorageType storage_type);
+  void set_memory_layout_override(api::GPUMemoryLayout memory_layout);
 };
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
index a43998b47c9..7cfe71eee7f 100644
--- a/backends/vulkan/test/op_tests/utils/codegen.py
+++ b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -582,8 +582,8 @@ class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple
         api::StorageType default_storage_type;
         api::GPUMemoryLayout default_memory_layout;
         std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam();
-        config.setStorageTypeOverride(default_storage_type);
-        config.setMemoryLayoutOverride(default_memory_layout);
+        config.set_storage_type_override(default_storage_type);
+        config.set_memory_layout_override(default_memory_layout);
         graph = new ComputeGraph(config);
 
         if (test_dtype == at::kHalf) {{

From 2d68bd35d858a9a5dca6ca8836d34d37ac3956d6 Mon Sep 17 00:00:00 2001
From: Jorge Pineda <jorgep31415@meta.com>
Date: Wed, 8 May 2024 16:31:56 -0700
Subject: [PATCH 08/62] Rename `Allocator.*` as `vma_api.*` (#3552)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3552

We currently use `vk_api.h` for inclusion of third-party `vulkan-headers`.

To adhere to the same style, we rename as `vma_api.h` for inclusion of third-party `VulkanMemoryAllocator`. (This also opens the door to renaming our wrapper `MemoryAllocator` to `Allocator` in the next change.)
ghstack-source-id: 225636265
exported-using-ghexport
bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: copyrightly, SS-JIA

Differential Revision: D57126895

fbshipit-source-id: ee1a9ee3af799de33c9e1222f031754ce1da16f2
---
 backends/vulkan/runtime/api/Resource.h                     | 2 +-
 backends/vulkan/runtime/api/{Allocator.cpp => vma_api.cpp} | 2 +-
 backends/vulkan/runtime/api/{Allocator.h => vma_api.h}     | 4 +---
 3 files changed, 3 insertions(+), 5 deletions(-)
 rename backends/vulkan/runtime/api/{Allocator.cpp => vma_api.cpp} (80%)
 rename backends/vulkan/runtime/api/{Allocator.h => vma_api.h} (92%)

diff --git a/backends/vulkan/runtime/api/Resource.h b/backends/vulkan/runtime/api/Resource.h
index 81388cdcb06..247e2f1c932 100644
--- a/backends/vulkan/runtime/api/Resource.h
+++ b/backends/vulkan/runtime/api/Resource.h
@@ -11,8 +11,8 @@
 // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
 
 #include <executorch/backends/vulkan/runtime/api/vk_api.h>
+#include <executorch/backends/vulkan/runtime/api/vma_api.h>
 
-#include <executorch/backends/vulkan/runtime/api/Allocator.h>
 #include <executorch/backends/vulkan/runtime/api/Types.h>
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
 
diff --git a/backends/vulkan/runtime/api/Allocator.cpp b/backends/vulkan/runtime/api/vma_api.cpp
similarity index 80%
rename from backends/vulkan/runtime/api/Allocator.cpp
rename to backends/vulkan/runtime/api/vma_api.cpp
index 3cedaa2f5af..26672339adf 100644
--- a/backends/vulkan/runtime/api/Allocator.cpp
+++ b/backends/vulkan/runtime/api/vma_api.cpp
@@ -7,4 +7,4 @@
  */
 
 #define VMA_IMPLEMENTATION
-#include <executorch/backends/vulkan/runtime/api/Allocator.h>
+#include <executorch/backends/vulkan/runtime/api/vma_api.h>
diff --git a/backends/vulkan/runtime/api/Allocator.h b/backends/vulkan/runtime/api/vma_api.h
similarity index 92%
rename from backends/vulkan/runtime/api/Allocator.h
rename to backends/vulkan/runtime/api/vma_api.h
index a5a9ea02a98..34e3219d934 100644
--- a/backends/vulkan/runtime/api/Allocator.h
+++ b/backends/vulkan/runtime/api/vma_api.h
@@ -10,11 +10,9 @@
 
 //
 // Do NOT include vk_mem_alloc.h directly.
-// Always include this file (Allocator.h) instead.
+// Always include this file (vma_api.h) instead.
 //
 
-#include <executorch/backends/vulkan/runtime/api/vk_api.h>
-
 #define VMA_VULKAN_VERSION 1000000
 
 #ifdef USE_VULKAN_WRAPPER

From f8403ed16f2da6e5d6db80a4a4ebe58a05215b91 Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@meta.com>
Date: Wed, 8 May 2024 18:09:17 -0700
Subject: [PATCH 09/62] Error out when token is outside of vocab size (#3535)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3535

Ideally this shouldn't happen, but it can if the weights are post-processed too aggressively. On Android, decoding a token outside the vocab range simply segfaults with no error message. After this change, the failure is reported clearly:
```
E 00:00:00.180911 executorch:bpe_tokenizer.cpp:155] token 18446744073709551615 is out side of vacab range 512
Aborted
```

Reviewed By: larryliu0820

Differential Revision: D57057026

fbshipit-source-id: 838260d60b75e7c392d7f496d7cdf6f81957f56c
---
 .../models/llama2/tokenizer/bpe_tokenizer.cpp    |  5 +----
 .../llama2/tokenizer/test/test_bpe_tokenizer.cpp |  8 ++++++++
 .../llama2/tokenizer/test/test_tiktoken.cpp      |  9 +++++++++
 examples/models/llama2/tokenizer/tiktoken.cpp    |  4 +---
 examples/models/llama2/tokenizer/tokenizer.h     | 16 ++++++++++++++++
 5 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/examples/models/llama2/tokenizer/bpe_tokenizer.cpp b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
index ed7d34aca4d..7af2357d9be 100644
--- a/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
+++ b/examples/models/llama2/tokenizer/bpe_tokenizer.cpp
@@ -146,10 +146,7 @@ BPETokenizer::~BPETokenizer() {
  * token.
  */
 Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
-  if (!initialized_) {
-    ET_LOG(Error, "Tokenizer not initialized");
-    return Error::NotSupported;
-  }
+  ET_CHECK_OK_OR_RETURN_ERROR(Tokenizer::decode_verify(token));
   const char* piece = vocab_[token];
   // following BOS token, sentencepiece decoder strips any leading
   // whitespace
diff --git a/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
index 1d1f83065cf..e9eada338d5 100644
--- a/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
+++ b/examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
@@ -39,6 +39,14 @@ TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) {
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
+TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
+  Error res = tokenizer_->load(modelPath_.c_str());
+  EXPECT_EQ(res, Error::Ok);
+  auto result = tokenizer_->decode(0, 64000);
+  // The vocab size is 32000, and token 64000 is out of vocab range.
+  EXPECT_EQ(result.error(), Error::NotSupported);
+}
+
 TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
index 2f08e2a1aa7..6130a9e858a 100644
--- a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
+++ b/examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -77,5 +77,14 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
   }
 }
 
+TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
+  Error res = tokenizer_->load(modelPath_.c_str());
+  EXPECT_EQ(res, Error::Ok);
+  // The vocab size is 128256; adding 256 guarantees the token is out of vocab
+  // range.
+  Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
+  EXPECT_EQ(out.error(), Error::NotSupported);
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/examples/models/llama2/tokenizer/tiktoken.cpp b/examples/models/llama2/tokenizer/tiktoken.cpp
index 849a2ff1e8d..79b61e5eb64 100644
--- a/examples/models/llama2/tokenizer/tiktoken.cpp
+++ b/examples/models/llama2/tokenizer/tiktoken.cpp
@@ -364,9 +364,7 @@ Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) {
 
 Result<std::string> Tiktoken::decode(uint64_t prev, uint64_t cur) {
   (void)prev;
-  if (!initialized_) {
-    return Error::NotSupported;
-  }
+  ET_CHECK_OK_OR_RETURN_ERROR(Tokenizer::decode_verify(cur));
   std::string ret;
 
   std::string token_bytes;
diff --git a/examples/models/llama2/tokenizer/tokenizer.h b/examples/models/llama2/tokenizer/tokenizer.h
index 5e9f0925823..7ad3b32bbb8 100644
--- a/examples/models/llama2/tokenizer/tokenizer.h
+++ b/examples/models/llama2/tokenizer/tokenizer.h
@@ -40,6 +40,29 @@ class Tokenizer {
   virtual Result<std::vector<uint64_t>>
   encode(const std::string& input, int8_t bos, int8_t eos) = 0;
 
+  /**
+   * Checks the preconditions for decoding `token`: the tokenizer must be
+   * initialized and `token` must be smaller than the vocab size.
+   *
+   * @return Error::Ok if both hold; otherwise logs the reason and returns
+   *     Error::NotSupported.
+   */
+  Error decode_verify(uint64_t token) const {
+    if (!initialized_) {
+      ET_LOG(Error, "Tokenizer not initialized");
+      return Error::NotSupported;
+    }
+    // Reject ids at or beyond the vocab size before a caller indexes with them.
+    if (token >= vocab_size_) {
+      ET_LOG(
+          Error,
+          "token %" PRIu64 " is outside of vocab range %d",
+          token,
+          vocab_size_);
+      return Error::NotSupported;
+    }
+    return Error::Ok;
+  }
+
   virtual Result<std::string> decode(uint64_t prev_token, uint64_t token) = 0;
 
   // getters

From dd81fc76d0ab2301202bd131135627aea370b357 Mon Sep 17 00:00:00 2001
From: Zingo Andersen <zingo@zingo.org>
Date: Thu, 9 May 2024 08:44:54 -0700
Subject: [PATCH 10/62] Fix backend/arm tests failing when there is no tosa
 installed (#3560)

Summary:
This fixes a problem with the clone and view unit tests when TOSA is not installed.

Pull Request resolved: https://github.com/pytorch/executorch/pull/3560

Reviewed By: mergennachin

Differential Revision: D57161708

Pulled By: digantdesai

fbshipit-source-id: e4b6733ef4da89bb894c5197a90912eaa7fe4b5c
---
 backends/arm/test/ops/test_clone.py | 4 ++--
 backends/arm/test/ops/test_view.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py
index 32554dfadd6..2eb94a82322 100644
--- a/backends/arm/test/ops/test_clone.py
+++ b/backends/arm/test/ops/test_clone.py
@@ -73,8 +73,8 @@ def _test_clone_tosa_BI_pipeline(
         if common.TOSA_REF_MODEL_INSTALLED:
             tester.run_method_and_compare_outputs(qtol=1)
         else:
-            logger.warning(
-                "TOSA ref model tool not installed, skip numerical correctness tests"
+            raise RuntimeError(
+                "TOSA ref model tool not installed and the test is an expected fail"
             )
 
     def _test_clone_tosa_u55_pipeline(
diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py
index 0620ecb49b4..fddd21ed2fb 100644
--- a/backends/arm/test/ops/test_view.py
+++ b/backends/arm/test/ops/test_view.py
@@ -70,8 +70,8 @@ def _test_view_tosa_BI_pipeline(
         if common.TOSA_REF_MODEL_INSTALLED:
             tester.run_method_and_compare_outputs(qtol=1)
         else:
-            logger.warning(
-                "TOSA ref model tool not installed, skip numerical correctness tests"
+            raise RuntimeError(
+                "TOSA ref model tool not installed and the test is an expected fail"
             )
 
     def _test_view_u55_BI_pipeline(

From fff20a738c5bb5c1cb259468b0a8aa6c6be8cd38 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Thu, 9 May 2024 09:46:22 -0700
Subject: [PATCH 11/62] Update pytorch pin (#3562)

Summary:
Action item for: https://github.com/pytorch/executorch/issues/3561

There are torch nightlies where yaml files don't exist (https://github.com/pytorch/pytorch/issues/124941) in certain wheels.

Pull Request resolved: https://github.com/pytorch/executorch/pull/3562

Test Plan: `unzip -t torch-2.4.0.dev20240507+cu118-cp38-cp38-linux_x86_64.whl | grep yaml` and make sure yaml files exist.

Reviewed By: tarun292

Differential Revision: D57163422

Pulled By: mergennachin

fbshipit-source-id: ba087ccd31315932c3203d5d7e5ec0d7a19878b6
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 install_requirements.sh               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index ccee7739dc6..ddd1f4a6b16 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-b1984237a0fb32b760c1b84d6d02d2f0f7ed293b
+48b6c8dbc376db4406a979b35cd6909bcb428931
diff --git a/install_requirements.sh b/install_requirements.sh
index c5b45706709..d88eb505a6c 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -59,7 +59,7 @@ done
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240422
+NIGHTLY_VERSION=dev20240507
 
 # The pip repository that hosts nightly torch packages.
 TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu"

From 21f5fbffe58d32569cc808f879b6d2506b278e20 Mon Sep 17 00:00:00 2001
From: Angela Yi <angelayi@meta.com>
Date: Thu, 9 May 2024 10:36:37 -0700
Subject: [PATCH 12/62] Catch check on symbolic shapes (#3537)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3537

Fixing P1215895395

Reviewed By: tarun292

Differential Revision: D56325190

fbshipit-source-id: a0d6edf84fa783f11b31f3340a94851738cb50b1
---
 exir/tensor.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/exir/tensor.py b/exir/tensor.py
index ee2633654e8..ee074cf7119 100644
--- a/exir/tensor.py
+++ b/exir/tensor.py
@@ -37,7 +37,11 @@ def contiguous_stride_from_shape(shape: torch.Size) -> Tuple[int]:
         strides.append(accum)
         # For sizes[i] == 0, treat it as 1 to be consistent with core Pytorch
         # This preserves the PT equivalent behavior for dims with 0 elements
-        if sz != 0:
+        if isinstance(sz, int):
+            if sz != 0:
+                accum *= sz
+        else:
+            # Unbacked symints may error on the != 0 check
             accum *= sz
     return tuple(reversed(strides))
 

From 38c30d6fa5f2841012e4f6e02d2ed07c687f7e6f Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 9 May 2024 11:52:44 -0700
Subject: [PATCH 13/62] use utils:{min,max}_override in {min,max}imum ops
 (#3453)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3453

Noticed this inconsistency with clamp.

Reviewed By: manuelcandales

Differential Revision: D56846313

fbshipit-source-id: 2fd891fd774101ad56c21cbea4984e2d9a7c9c20
---
 kernels/portable/cpu/op_maximum.cpp | 8 ++------
 kernels/portable/cpu/op_minimum.cpp | 8 ++------
 kernels/portable/cpu/targets.bzl    | 2 ++
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp
index 4091f2cf8ca..1353479b294 100644
--- a/kernels/portable/cpu/op_maximum.cpp
+++ b/kernels/portable/cpu/op_maximum.cpp
@@ -8,6 +8,7 @@
 
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/math_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -15,11 +16,6 @@ namespace executor {
 namespace native {
 namespace {
 
-template <class T>
-const T& max(const T& a, const T& b) {
-  return (b > a) ? b : a;
-}
-
 template <
     bool can_cast,
     typename CTYPE_A,
@@ -40,7 +36,7 @@ struct MaximumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
         [](const CTYPE_A val_a, const CTYPE_B val_b) {
           CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
           CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-          CTYPE_IN value = max(a_casted, b_casted);
+          CTYPE_IN value = utils::max_override(a_casted, b_casted);
 
           return static_cast<CTYPE_OUT>(value);
         },
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
index 44c0efa8a67..f18d1a6d368 100644
--- a/kernels/portable/cpu/op_minimum.cpp
+++ b/kernels/portable/cpu/op_minimum.cpp
@@ -8,6 +8,7 @@
 
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/math_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -15,11 +16,6 @@ namespace executor {
 namespace native {
 namespace {
 
-template <class T>
-const T& min(const T& a, const T& b) {
-  return (b < a) ? b : a;
-}
-
 template <
     bool can_cast,
     typename CTYPE_A,
@@ -40,7 +36,7 @@ struct MinimumInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
         [](const CTYPE_A val_a, const CTYPE_B val_b) {
           CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
           CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-          CTYPE_IN value = min(a_casted, b_casted);
+          CTYPE_IN value = utils::min_override(a_casted, b_casted);
 
           return static_cast<CTYPE_OUT>(value);
         },
diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index 77796c68526..bffe2fcf48c 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -560,6 +560,7 @@ _ATEN_OPS = (
         name = "op_maximum",
         deps = [
             "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/kernels/portable/cpu/util:math_util",
             ":scalar_utils",
         ],
     ),
@@ -591,6 +592,7 @@ _ATEN_OPS = (
         name = "op_minimum",
         deps = [
             "//executorch/kernels/portable/cpu/util:broadcast_util",
+            "//executorch/kernels/portable/cpu/util:math_util",
             ":scalar_utils",
         ],
     ),

From 8ebe1c9487d4398f9b85c95fe1008e5897797a61 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 9 May 2024 11:52:44 -0700
Subject: [PATCH 14/62] Use compile-time promotion to reduce bitwise op size &
 build time (#3487)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3487

Finally getting close to the end of compile-time promotion for Tensor ops!

Reviewed By: manuelcandales

Differential Revision: D56855548

fbshipit-source-id: ca93db620c88babbb8ae0c7dc7d6a569c3bd13d6
---
 kernels/portable/cpu/op_bitwise_and.cpp   | 63 ++++++--------------
 kernels/portable/cpu/op_bitwise_or.cpp    | 61 ++++++-------------
 kernels/portable/cpu/op_bitwise_xor.cpp   | 64 ++++++--------------
 kernels/portable/cpu/pattern/bitwise_op.h | 72 +++++++++++++++++++++++
 kernels/portable/cpu/pattern/targets.bzl  | 11 ++++
 kernels/portable/cpu/scalar_utils.h       | 22 +++----
 kernels/portable/cpu/targets.bzl          |  3 +
 7 files changed, 152 insertions(+), 144 deletions(-)
 create mode 100644 kernels/portable/cpu/pattern/bitwise_op.h

diff --git a/kernels/portable/cpu/op_bitwise_and.cpp b/kernels/portable/cpu/op_bitwise_and.cpp
index b1078f780a4..de137afbec2 100644
--- a/kernels/portable/cpu/op_bitwise_and.cpp
+++ b/kernels/portable/cpu/op_bitwise_and.cpp
@@ -6,8 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <cmath>
+// patternlint-disable-next-line executorch-cpp-nostdinc
+#include <functional>
 
+#include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -17,20 +19,6 @@ namespace torch {
 namespace executor {
 namespace native {
 
-namespace {
-
-template <typename CTYPE>
-CTYPE bitwise_and(CTYPE a, CTYPE b) {
-  return a & b;
-}
-
-template <>
-bool bitwise_and<bool>(bool a, bool b) {
-  return a && b;
-}
-
-} // namespace
-
 using Tensor = exec_aten::Tensor;
 
 Tensor& bitwise_and_Tensor_out(
@@ -55,38 +43,23 @@ Tensor& bitwise_and_Tensor_out(
       Bool, a_type, ctx, "bitwise_and.Tensor_out", CTYPE_A, [&]() {
         ET_SWITCH_INT_TYPES_AND(
             Bool, b_type, ctx, "bitwise_and.Tensor_out", CTYPE_B, [&]() {
-              ET_SWITCH_INT_TYPES_AND(
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+              ET_SWITCH_REAL_TYPES_AND(
                   Bool,
-                  common_type,
+                  out_type,
                   ctx,
                   "bitwise_and.Tensor_out",
-                  CTYPE_IN,
+                  CTYPE_OUT,
                   [&]() {
-                    ET_SWITCH_REAL_TYPES_AND(
-                        Bool,
-                        out_type,
-                        ctx,
-                        "bitwise_and.Tensor_out",
-                        CTYPE_OUT,
-                        [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value =
-                                    bitwise_and(a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                    internal::BitwiseOpInner<
+                        can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        std::bit_and,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out);
                   });
             });
       });
@@ -142,8 +115,8 @@ Tensor& bitwise_and_Scalar_out(
                                     static_cast<CTYPE_IN>(val_a);
                                 CTYPE_IN b_casted =
                                     static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value =
-                                    bitwise_and(a_casted, b_casted);
+                                CTYPE_IN value = std::bit_and<CTYPE_IN>()(
+                                    a_casted, b_casted);
 
                                 return static_cast<CTYPE_OUT>(value);
                               },
diff --git a/kernels/portable/cpu/op_bitwise_or.cpp b/kernels/portable/cpu/op_bitwise_or.cpp
index c13c68d3db4..39707de07ce 100644
--- a/kernels/portable/cpu/op_bitwise_or.cpp
+++ b/kernels/portable/cpu/op_bitwise_or.cpp
@@ -6,8 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <cmath>
+// patternlint-disable-next-line executorch-cpp-nostdinc
+#include <functional>
 
+#include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -17,20 +19,6 @@ namespace torch {
 namespace executor {
 namespace native {
 
-namespace {
-
-template <typename CTYPE>
-CTYPE bitwise_or(CTYPE a, CTYPE b) {
-  return a | b;
-}
-
-template <>
-bool bitwise_or<bool>(bool a, bool b) {
-  return a || b;
-}
-
-} // namespace
-
 using Tensor = exec_aten::Tensor;
 
 Tensor& bitwise_or_Tensor_out(
@@ -55,37 +43,23 @@ Tensor& bitwise_or_Tensor_out(
       Bool, a_type, ctx, "bitwise_or.Tensor_out", CTYPE_A, [&]() {
         ET_SWITCH_INT_TYPES_AND(
             Bool, b_type, ctx, "bitwise_or.Tensor_out", CTYPE_B, [&]() {
-              ET_SWITCH_INT_TYPES_AND(
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+              ET_SWITCH_REAL_TYPES_AND(
                   Bool,
-                  common_type,
+                  out_type,
                   ctx,
                   "bitwise_or.Tensor_out",
-                  CTYPE_IN,
+                  CTYPE_OUT,
                   [&]() {
-                    ET_SWITCH_REAL_TYPES_AND(
-                        Bool,
-                        out_type,
-                        ctx,
-                        "bitwise_or.Tensor_out",
-                        CTYPE_OUT,
-                        [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value = bitwise_or(a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                    internal::BitwiseOpInner<
+                        can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        std::bit_or,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out);
                   });
             });
       });
@@ -141,7 +115,8 @@ Tensor& bitwise_or_Scalar_out(
                                     static_cast<CTYPE_IN>(val_a);
                                 CTYPE_IN b_casted =
                                     static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value = bitwise_or(a_casted, b_casted);
+                                CTYPE_IN value =
+                                    std::bit_or<CTYPE_IN>()(a_casted, b_casted);
 
                                 return static_cast<CTYPE_OUT>(value);
                               },
diff --git a/kernels/portable/cpu/op_bitwise_xor.cpp b/kernels/portable/cpu/op_bitwise_xor.cpp
index d2ea8a81cfb..1855485ee52 100644
--- a/kernels/portable/cpu/op_bitwise_xor.cpp
+++ b/kernels/portable/cpu/op_bitwise_xor.cpp
@@ -6,8 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <cmath>
+// patternlint-disable-next-line executorch-cpp-nostdinc
+#include <functional>
 
+#include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -17,20 +19,6 @@ namespace torch {
 namespace executor {
 namespace native {
 
-namespace {
-
-template <typename CTYPE>
-CTYPE bitwise_xor(CTYPE a, CTYPE b) {
-  return a ^ b;
-}
-
-template <>
-bool bitwise_xor<bool>(bool a, bool b) {
-  return a != b;
-}
-
-} // namespace
-
 using Tensor = exec_aten::Tensor;
 
 Tensor& bitwise_xor_Tensor_out(
@@ -38,7 +26,6 @@ Tensor& bitwise_xor_Tensor_out(
     const Tensor& a,
     const Tensor& b,
     Tensor& out) {
-  // Determine output size and resize for dynamic shapes
   ET_KERNEL_CHECK(
       ctx,
       resize_to_broadcast_target_size(a, b, out) == Error::Ok,
@@ -56,38 +43,23 @@ Tensor& bitwise_xor_Tensor_out(
       Bool, a_type, ctx, "bitwise_xor.Tensor_out", CTYPE_A, [&]() {
         ET_SWITCH_INT_TYPES_AND(
             Bool, b_type, ctx, "bitwise_xor.Tensor_out", CTYPE_B, [&]() {
-              ET_SWITCH_INT_TYPES_AND(
+              using CTYPE_IN = typename torch::executor::
+                  promote_types<CTYPE_A, CTYPE_B>::type;
+              ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+              ET_SWITCH_REAL_TYPES_AND(
                   Bool,
-                  common_type,
+                  out_type,
                   ctx,
                   "bitwise_xor.Tensor_out",
-                  CTYPE_IN,
+                  CTYPE_OUT,
                   [&]() {
-                    ET_SWITCH_REAL_TYPES_AND(
-                        Bool,
-                        out_type,
-                        ctx,
-                        "bitwise_xor.Tensor_out",
-                        CTYPE_OUT,
-                        [&]() {
-                          apply_binary_elementwise_fn<
-                              CTYPE_A,
-                              CTYPE_B,
-                              CTYPE_OUT>(
-                              [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                                CTYPE_IN a_casted =
-                                    static_cast<CTYPE_IN>(val_a);
-                                CTYPE_IN b_casted =
-                                    static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value =
-                                    bitwise_xor(a_casted, b_casted);
-
-                                return static_cast<CTYPE_OUT>(value);
-                              },
-                              a,
-                              b,
-                              out);
-                        });
+                    internal::BitwiseOpInner<
+                        can_cast<CTYPE_IN, CTYPE_OUT>::value,
+                        std::bit_xor,
+                        CTYPE_A,
+                        CTYPE_B,
+                        CTYPE_IN,
+                        CTYPE_OUT>::run(a, b, out);
                   });
             });
       });
@@ -143,8 +115,8 @@ Tensor& bitwise_xor_Scalar_out(
                                     static_cast<CTYPE_IN>(val_a);
                                 CTYPE_IN b_casted =
                                     static_cast<CTYPE_IN>(val_b);
-                                CTYPE_IN value =
-                                    bitwise_xor(a_casted, b_casted);
+                                CTYPE_IN value = std::bit_xor<CTYPE_IN>()(
+                                    a_casted, b_casted);
 
                                 return static_cast<CTYPE_OUT>(value);
                               },
diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h
new file mode 100644
index 00000000000..dda4fe5cd55
--- /dev/null
+++ b/kernels/portable/cpu/pattern/bitwise_op.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+namespace internal {
+
+template <
+    bool can_cast,
+    template <typename>
+    class OpFunc,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct BitwiseOpInner;
+
+template <
+    template <typename>
+    class OpFunc,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct BitwiseOpInner<true, OpFunc, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = OpFunc<CTYPE_IN>()(a_casted, b_casted);
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    template <typename>
+    class OpFunc,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct BitwiseOpInner<false, OpFunc, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
+
+} // namespace internal
+} // namespace native
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl
index 360d991767b..7e0b71ed950 100644
--- a/kernels/portable/cpu/pattern/targets.bzl
+++ b/kernels/portable/cpu/pattern/targets.bzl
@@ -6,6 +6,17 @@ def define_common_targets():
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
+    runtime.cxx_library(
+        name = "bitwise_op",
+        exported_headers = [
+            "bitwise_op.h",
+        ],
+        compiler_flags = [],
+        deps = [
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
+    )
 
     runtime.cxx_library(
         name = "pattern",
diff --git a/kernels/portable/cpu/scalar_utils.h b/kernels/portable/cpu/scalar_utils.h
index 989e7978fc3..3daf3e72526 100644
--- a/kernels/portable/cpu/scalar_utils.h
+++ b/kernels/portable/cpu/scalar_utils.h
@@ -84,9 +84,9 @@ template <typename T1, typename T2, bool half_to_float = false>
 struct promote_type_with_scalar_type {
  private:
   static_assert(
-      std::is_same<T2, internal::B1>::value ||
-          std::is_same<T2, internal::I8>::value ||
-          std::is_same<T2, internal::F8>::value,
+      std::is_same<T2, torch::executor::internal::B1>::value ||
+          std::is_same<T2, torch::executor::internal::I8>::value ||
+          std::is_same<T2, torch::executor::internal::F8>::value,
       "scalar type can only be Bool, Long or Double");
   static_assert(
       !is_qint_type<T1>::value,
@@ -102,17 +102,19 @@ struct promote_type_with_scalar_type {
       "promote_type_with_scalar_type not valid for BFloat16");
   using promote_type_with_scalar_type_not_respecting_half_to_float =
       typename std::conditional<
-          is_complex_type<T1>::value || std::is_same<T2, internal::B1>::value,
+          is_complex_type<T1>::value ||
+              std::is_same<T2, torch::executor::internal::B1>::value,
           T1,
           typename std::conditional<
-              std::is_same<T2, internal::I8>::value,
+              std::is_same<T2, torch::executor::internal::I8>::value,
               typename std::conditional<
-                  std::is_same<T1, internal::B1>::value,
-                  internal::I8,
+                  std::is_same<T1, torch::executor::internal::B1>::value,
+                  torch::executor::internal::I8,
                   T1>::type,
-              typename std::
-                  conditional<is_floating_point<T1>::value, T1, internal::F4>::
-                      type>::type>::type;
+              typename std::conditional<
+                  is_floating_point<T1>::value,
+                  T1,
+                  torch::executor::internal::F4>::type>::type>::type;
 
  public:
   using type = typename std::conditional<
diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index bffe2fcf48c..7be1d94d2bf 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -142,6 +142,7 @@ _ATEN_OPS = (
         deps = [
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
+            "//executorch/kernels/portable/cpu/pattern:bitwise_op",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
             "//executorch/kernels/portable/cpu/util:functional_util",
             ":scalar_utils",
@@ -160,6 +161,7 @@ _ATEN_OPS = (
         deps = [
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
+            "//executorch/kernels/portable/cpu/pattern:bitwise_op",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
             "//executorch/kernels/portable/cpu/util:functional_util",
             ":scalar_utils",
@@ -170,6 +172,7 @@ _ATEN_OPS = (
         deps = [
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
+            "//executorch/kernels/portable/cpu/pattern:bitwise_op",
             "//executorch/kernels/portable/cpu/util:broadcast_util",
             "//executorch/kernels/portable/cpu/util:functional_util",
             ":scalar_utils",

From 5adc8bfb6c6caa9f9dbd5e983c42bdfca242baa0 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 9 May 2024 11:52:44 -0700
Subject: [PATCH 15/62] Use compile-time promotion to reduce optimized mul op
 size & build time (#3532)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3532

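Another in a long line of fixes: CTYPE_IN is now computed at compile time with promote_types instead of being dispatched at runtime on common_type.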

Reviewed By: manuelcandales

Differential Revision: D56896048

fbshipit-source-id: c945f2bf4028944591332e9eda3ce8046d7cc049
---
 kernels/optimized/cpu/op_mul.cpp | 73 ++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 14 deletions(-)

diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
index 3b2926a8a74..adcd8999150 100644
--- a/kernels/optimized/cpu/op_mul.cpp
+++ b/kernels/optimized/cpu/op_mul.cpp
@@ -41,6 +41,50 @@ bool can_use_optimized_path(
        (a.numel() == b.numel() && a.numel() == out.numel()));
   return can_use_optimized_path;
 }
+
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MulInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MulInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct MulInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug {};
 } // namespace
 
 Tensor& opt_mul_out(
@@ -86,20 +130,21 @@ Tensor& opt_mul_out(
 
     ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
       ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
-        ET_SWITCH_REALB_TYPES(common_type, ctx, "mul.out", CTYPE_IN, [&]() {
-          ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted * b_casted;
-
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+        using CTYPE_IN = typename torch::executor::
+            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+        ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
+          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+              [](const CTYPE_A val_a, const CTYPE_B val_b) {
+                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+                CTYPE_IN value = a_casted * b_casted;
+
+                return static_cast<CTYPE_OUT>(value);
+              },
+              a,
+              b,
+              out);
         });
       });
     });

From a4110e0820ff4c027c2e2b43b9a0496fa8d28f0b Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 9 May 2024 11:52:44 -0700
Subject: [PATCH 16/62] Use compile-time promotion to reduce optimized add/sub
 op size & build time (#3533)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3533

Yet another pair of ops.

Reviewed By: manuelcandales

Differential Revision: D57023819

fbshipit-source-id: b3ce993c6926d0e1e277278e8a5a4638429a4a1e
---
 kernels/optimized/cpu/op_add.cpp | 83 ++++++++++++++++++++++++--------
 kernels/optimized/cpu/op_sub.cpp | 82 +++++++++++++++++++++++--------
 2 files changed, 125 insertions(+), 40 deletions(-)

diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp
index c11c9977fe5..b62c3b154fa 100644
--- a/kernels/optimized/cpu/op_add.cpp
+++ b/kernels/optimized/cpu/op_add.cpp
@@ -16,6 +16,55 @@
 namespace torch {
 namespace executor {
 namespace native {
+namespace {
+
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted + alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
 
 using Tensor = exec_aten::Tensor;
 using ScalarType = exec_aten::ScalarType;
@@ -69,26 +118,20 @@ Tensor& opt_add_out(
 
     ET_SWITCH_REALHB_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() {
       ET_SWITCH_REALHB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
-        ET_SWITCH_REALB_TYPES(common_type, ctx, "add.out", CTYPE_IN, [&]() {
-          ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() {
-            CTYPE_IN alpha_val;
-            ET_KERNEL_CHECK(
-                ctx,
-                utils::extract_scalar(alpha, &alpha_val),
-                InvalidArgument, );
-
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted + alpha_val * b_casted;
-
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+        using CTYPE_IN = typename torch::executor::
+            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+        ET_SWITCH_REALHB_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() {
+          CTYPE_IN alpha_val;
+          ET_KERNEL_CHECK(
+              ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
+
+          AddInner<
+              can_cast<CTYPE_IN, CTYPE_OUT>::value,
+              CTYPE_A,
+              CTYPE_B,
+              CTYPE_IN,
+              CTYPE_OUT>::run(a, b, alpha_val, out);
         });
       });
     });
diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp
index 77917c0eda6..87368f3ed76 100644
--- a/kernels/optimized/cpu/op_sub.cpp
+++ b/kernels/optimized/cpu/op_sub.cpp
@@ -17,6 +17,55 @@
 namespace torch {
 namespace executor {
 namespace native {
+namespace {
+
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct SubInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct SubInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted - alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct SubInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
 
 using Tensor = exec_aten::Tensor;
 using ScalarType = exec_aten::ScalarType;
@@ -72,26 +121,19 @@ Tensor& opt_sub_out(
 
     ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() {
       ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() {
-        ET_SWITCH_REAL_TYPES(common_type, ctx, "sub.out", CTYPE_IN, [&]() {
-          ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() {
-            CTYPE_IN alpha_val;
-            ET_KERNEL_CHECK(
-                ctx,
-                utils::extract_scalar(alpha, &alpha_val),
-                InvalidArgument, );
-
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted - alpha_val * b_casted;
-
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+        using CTYPE_IN = typename torch::executor::
+            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+        ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() {
+          CTYPE_IN alpha_val;
+          ET_KERNEL_CHECK(
+              ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
+          SubInner<
+              can_cast<CTYPE_IN, CTYPE_OUT>::value,
+              CTYPE_A,
+              CTYPE_B,
+              CTYPE_IN,
+              CTYPE_OUT>::run(a, b, alpha_val, out);
         });
       });
     });

From b3c3e6547744cde19b35f2578aa038ee33ca6682 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 9 May 2024 11:52:44 -0700
Subject: [PATCH 17/62] Use compile-time promotion to reduce optimized le op
 size & build time (#3534)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3534

Yet another optimized op.

Reviewed By: manuelcandales

Differential Revision: D57028967

fbshipit-source-id: a8203e8cca86beadf352630893e8822dd022c819
---
 kernels/optimized/cpu/op_le.cpp | 35 ++++++++++++++-------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 05e7889671b..15481403c2d 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -53,31 +53,26 @@ Tensor& opt_le_tensor_out(
               a.numel());
         });
   } else {
-    ScalarType common_type = promoteTypes(a_type, b_type);
     ET_SWITCH_REAL_TYPES_AND(
         Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() {
           ET_SWITCH_REAL_TYPES_AND(
               Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() {
+                using CTYPE_IN = typename torch::executor::
+                    promote_types<CTYPE_A, CTYPE_B>::type;
+                ET_DCHECK(
+                    CppTypeToScalarType<CTYPE_IN>::value ==
+                    promoteTypes(a_type, b_type));
                 ET_SWITCH_REAL_TYPES_AND(
-                    Bool, common_type, ctx, "le.Tensor_out", CTYPE_IN, [&]() {
-                      ET_SWITCH_REAL_TYPES_AND(
-                          Bool,
-                          out_type,
-                          ctx,
-                          "le.Tensor_out",
-                          CTYPE_OUT,
-                          [&]() {
-                            const size_t n = a.numel();
-                            const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                            const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
-                            CTYPE_OUT* out_data =
-                                out.mutable_data_ptr<CTYPE_OUT>();
-                            for (auto i = 0; i < n; ++i) {
-                              out_data[i] = static_cast<CTYPE_OUT>(
-                                  static_cast<CTYPE_IN>(a_data[i]) <=
-                                  static_cast<CTYPE_IN>(b_data[i]));
-                            }
-                          });
+                    Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() {
+                      const size_t n = a.numel();
+                      const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
+                      const CTYPE_B* b_data = b.const_data_ptr<CTYPE_B>();
+                      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+                      for (size_t i = 0; i < n; ++i) {  // same type as n
+                        out_data[i] = static_cast<CTYPE_OUT>(
+                            static_cast<CTYPE_IN>(a_data[i]) <=
+                            static_cast<CTYPE_IN>(b_data[i]));
+                      }
                     });
               });
         });

From cc1154167b23fa221a33f79e09a1a9629f9ef722 Mon Sep 17 00:00:00 2001
From: David Lin <lind@meta.com>
Date: Thu, 9 May 2024 11:57:47 -0700
Subject: [PATCH 18/62] make extract_delegate_segments=True by default (#3405)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3405

Updated all existing call sites to pass `extract_delegate_segments=False` explicitly, preserving the previous default behavior.

When extract_delegate_segments is set to False (the previous behavior), the backend blob data is part of the flatbuffer-serialized program. This leads to higher memory consumption: backends may not need the input blob after initialization, but they cannot free the memory because it is part of the flatbuffer.

When extract_delegate_segments is set to True, the backend blob data is extracted into separate segments. Each backend can then free the memory after initialization if it is no longer needed, which reduces peak memory consumption. The downside is an increased program size due to internal padding between the flatbuffer program and the extracted segments.

Reviewed By: JacobSzwejbka, cccclai, dbort, zonglinpengmeta

Differential Revision: D56712292

fbshipit-source-id: 0f29972357b3a8288f170ce4a00f0d7b043036e5
---
 backends/apple/mps/test/test_mps_utils.py   |  8 ++++++--
 backends/arm/test/arm_tosa_reference.py     |  4 +++-
 backends/qualcomm/tests/utils.py            |  3 ++-
 examples/apple/mps/scripts/mps_example.py   | 10 ++++++++--
 examples/arm/aot_arm_compiler.py            |  4 +++-
 examples/qualcomm/scripts/export_example.py |  4 +++-
 examples/xnnpack/aot_compiler.py            |  4 +++-
 examples/xnnpack/quantization/example.py    |  4 +++-
 exir/capture/_config.py                     |  2 +-
 exir/program/test/test_program.py           | 20 ++++++++++++--------
 exir/tests/test_memory_planning.py          |  2 +-
 11 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
index 08088df7db5..36f11c08c80 100644
--- a/backends/apple/mps/test/test_mps_utils.py
+++ b/backends/apple/mps/test/test_mps_utils.py
@@ -247,7 +247,9 @@ def lower_module_and_test_output(
             )
 
             executorch_program = delegated_program.to_executorch(
-                config=ExecutorchBackendConfig(extract_constant_segment=False)
+                config=ExecutorchBackendConfig(
+                    extract_delegate_segments=False, extract_constant_segment=False
+                )
             )
         else:
             delegated_program = to_backend(
@@ -264,7 +266,9 @@ def lower_module_and_test_output(
                     _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
                 ),
             ).to_executorch(
-                config=ExecutorchBackendConfig(extract_constant_segment=False)
+                config=ExecutorchBackendConfig(
+                    extract_delegate_segments=False, extract_constant_segment=False
+                )
             )
 
         if bundled_program:
diff --git a/backends/arm/test/arm_tosa_reference.py b/backends/arm/test/arm_tosa_reference.py
index ef6db7db526..f6a7fd97876 100644
--- a/backends/arm/test/arm_tosa_reference.py
+++ b/backends/arm/test/arm_tosa_reference.py
@@ -202,7 +202,9 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
 
     model_edge = model_edge.to_backend(ArmPartitioner(compile_spec))
     exec_prog = model_edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     # Save ground truth results to file
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 59a48f123da..b7390bd42b2 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -187,6 +187,7 @@ def lower_module_and_test_output(
         )
         exec_prog = delegated_program.to_executorch(
             exir.ExecutorchBackendConfig(
+                extract_delegate_segments=False,
                 # For shared buffer, user must pass the memory address
                 # which is allocated by RPC memory to executor runner.
                 # Therefore, won't want to pre-allocate
@@ -195,7 +196,7 @@ def lower_module_and_test_output(
                     memory_planning_algo="greedy",
                     alloc_graph_input=not self.shared_buffer,
                     alloc_graph_output=not self.shared_buffer,
-                )
+                ),
             )
         )
 
diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py
index 0bfef7bf4ce..c6ef6b14c74 100644
--- a/examples/apple/mps/scripts/mps_example.py
+++ b/examples/apple/mps/scripts/mps_example.py
@@ -182,7 +182,9 @@ def get_model_config(args):
         logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
 
         executorch_program = edge.to_executorch(
-            config=ExecutorchBackendConfig(extract_constant_segment=False)
+            config=ExecutorchBackendConfig(
+                extract_delegate_segments=False, extract_constant_segment=False
+            )
         )
     else:
         lowered_module = to_backend(
@@ -192,7 +194,11 @@ def get_model_config(args):
             lowered_module,
             example_inputs,
             edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
-        ).to_executorch(config=ExecutorchBackendConfig(extract_constant_segment=False))
+        ).to_executorch(
+            config=ExecutorchBackendConfig(
+                extract_delegate_segments=False, extract_constant_segment=False
+            )
+        )
 
     model_name = f"{args.model_name}_mps"
 
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 2c74a829b87..7f30924b7b4 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -211,7 +211,9 @@ def forward(self, x):
         logging.debug(f"Lowered graph:\n{edge.exported_program().graph}")
 
     exec_prog = edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     model_name = f"{args.model_name}" + (
diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py
index a6d2e6d1a3e..98b245c512d 100644
--- a/examples/qualcomm/scripts/export_example.py
+++ b/examples/qualcomm/scripts/export_example.py
@@ -96,7 +96,9 @@
         )
 
     executorch_program = delegated_program.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     if args.generate_etrecord:
diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py
index 4ef6852fd28..f23ba5e9c21 100644
--- a/examples/xnnpack/aot_compiler.py
+++ b/examples/xnnpack/aot_compiler.py
@@ -103,7 +103,9 @@
     logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
 
     exec_prog = edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     if args.etrecord is not None:
diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py
index 4804af0b42e..a47d2180667 100644
--- a/examples/xnnpack/quantization/example.py
+++ b/examples/xnnpack/quantization/example.py
@@ -191,7 +191,9 @@ def main() -> None:
 
     start = time.perf_counter()
     prog = edge_m.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
     save_pte_program(prog, f"{args.model_name}_quantized")
     end = time.perf_counter()
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index fecb2382e27..c03be0e24f3 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -55,7 +55,7 @@ class ExecutorchBackendConfig:
     # Whether to move delegate data blobs from the Program into separate
     # segments, rather than encoding those blobs in the flatbuffer data.
     # This makes it possible to free those blobs at runtime.
-    extract_delegate_segments: bool = False
+    extract_delegate_segments: bool = True
 
     # Whether to extract constants from the Program into separate segments,
     # rather than encoding those constants in the flatbuffer data.
diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py
index 01de1f3befd..51f0fcf0788 100644
--- a/exir/program/test/test_program.py
+++ b/exir/program/test/test_program.py
@@ -293,9 +293,7 @@ def test_edge_to_backend_replaces_subgraph(self):
         # two delegate blobs for forward and foo
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(
-                    ExecutorchBackendConfig(extract_delegate_segments=True)
-                )
+                delegate_manager.to_executorch(ExecutorchBackendConfig())
                 ._emitter_output.program.execution_plan[0]
                 .delegates
             ),
@@ -303,9 +301,7 @@ def test_edge_to_backend_replaces_subgraph(self):
         )
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(
-                    ExecutorchBackendConfig(extract_delegate_segments=True)
-                )
+                delegate_manager.to_executorch(ExecutorchBackendConfig())
                 ._emitter_output.program.execution_plan[1]
                 .delegates
             ),
@@ -349,7 +345,11 @@ def test_edge_to_backend_selective(self):
         # one delegate blob for forward
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(ExecutorchBackendConfig())
+                delegate_manager.to_executorch(
+                    ExecutorchBackendConfig(
+                        extract_delegate_segments=False,
+                    )
+                )
                 ._emitter_output.program.execution_plan[0]  # foo
                 .delegates
             ),
@@ -357,7 +357,11 @@ def test_edge_to_backend_selective(self):
         )
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(ExecutorchBackendConfig())
+                delegate_manager.to_executorch(
+                    ExecutorchBackendConfig(
+                        extract_delegate_segments=False,
+                    )
+                )
                 ._emitter_output.program.execution_plan[1]  # forward
                 .delegates
             ),
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py
index 90a6d7b7d8c..12a0583ab41 100644
--- a/exir/tests/test_memory_planning.py
+++ b/exir/tests/test_memory_planning.py
@@ -495,7 +495,7 @@ def test_multiple_pools(
                 memory_planning_pass=CustomPoolMemoryPlanningPass(
                     memory_planning_algo=algo,
                     alignment=1,
-                )
+                ),
             )
         )
         graph_module = edge_program.exported_program().graph_module

From edaae14f68fa8216b7ba9530fad25c0d8097d3a9 Mon Sep 17 00:00:00 2001
From: Jorge Pineda <jorgep31415@meta.com>
Date: Thu, 9 May 2024 12:22:59 -0700
Subject: [PATCH 19/62] Split `Resource.h` into multiple files within `memory/`
 (#3553)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3553

This change is a no-op and simply refactors existing classes.

I'm trying to learn what's going on within this `api/` folder.

`Resource.*` files can be split to be less intimidating to readers. I'm also thinking we can flatten the hierarchy more in future changes.
ghstack-source-id: 225755561
exported-using-ghexport
bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: SS-JIA

Differential Revision: D57126893

fbshipit-source-id: 0a5f729c70351a62647af70225d8288faa5af035
---
 backends/vulkan/runtime/api/Adapter.h         |   6 +-
 backends/vulkan/runtime/api/Command.h         |   4 +-
 backends/vulkan/runtime/api/Context.h         |   4 +-
 backends/vulkan/runtime/api/Descriptor.h      |   4 +-
 backends/vulkan/runtime/api/Exception.h       |   7 +-
 backends/vulkan/runtime/api/Fence.cpp         |  76 ++
 backends/vulkan/runtime/api/Fence.h           |  98 ++
 backends/vulkan/runtime/api/Pipeline.h        |   4 +-
 backends/vulkan/runtime/api/Resource.cpp      | 838 ------------------
 backends/vulkan/runtime/api/Resource.h        | 599 -------------
 backends/vulkan/runtime/api/Tensor.cpp        |   2 +-
 backends/vulkan/runtime/api/Tensor.h          |   2 +-
 backends/vulkan/runtime/api/api.h             |   7 +-
 .../vulkan/runtime/api/memory/Allocation.cpp  |  74 ++
 .../vulkan/runtime/api/memory/Allocation.h    |  56 ++
 .../vulkan/runtime/api/memory/Allocator.cpp   | 190 ++++
 .../vulkan/runtime/api/memory/Allocator.h     | 110 +++
 backends/vulkan/runtime/api/memory/Buffer.cpp | 194 ++++
 backends/vulkan/runtime/api/memory/Buffer.h   | 174 ++++
 backends/vulkan/runtime/api/memory/Image.cpp  | 336 +++++++
 backends/vulkan/runtime/api/memory/Image.h    | 253 ++++++
 .../runtime/api/{ => memory}/vma_api.cpp      |   2 +-
 .../vulkan/runtime/api/{ => memory}/vma_api.h |   0
 .../runtime/graph/containers/SharedObject.h   |   2 +-
 backends/vulkan/targets.bzl                   |   4 +-
 backends/vulkan/test/utils/test_utils.cpp     |   2 +-
 backends/vulkan/test/utils/test_utils.h       |   2 +-
 .../vulkan/test/vulkan_compute_api_test.cpp   |  24 +-
 28 files changed, 1607 insertions(+), 1467 deletions(-)
 create mode 100644 backends/vulkan/runtime/api/Fence.cpp
 create mode 100644 backends/vulkan/runtime/api/Fence.h
 delete mode 100644 backends/vulkan/runtime/api/Resource.cpp
 delete mode 100644 backends/vulkan/runtime/api/Resource.h
 create mode 100644 backends/vulkan/runtime/api/memory/Allocation.cpp
 create mode 100644 backends/vulkan/runtime/api/memory/Allocation.h
 create mode 100644 backends/vulkan/runtime/api/memory/Allocator.cpp
 create mode 100644 backends/vulkan/runtime/api/memory/Allocator.h
 create mode 100644 backends/vulkan/runtime/api/memory/Buffer.cpp
 create mode 100644 backends/vulkan/runtime/api/memory/Buffer.h
 create mode 100644 backends/vulkan/runtime/api/memory/Image.cpp
 create mode 100644 backends/vulkan/runtime/api/memory/Image.h
 rename backends/vulkan/runtime/api/{ => memory}/vma_api.cpp (78%)
 rename backends/vulkan/runtime/api/{ => memory}/vma_api.h (100%)

diff --git a/backends/vulkan/runtime/api/Adapter.h b/backends/vulkan/runtime/api/Adapter.h
index b038aea9fa8..ef246260021 100644
--- a/backends/vulkan/runtime/api/Adapter.h
+++ b/backends/vulkan/runtime/api/Adapter.h
@@ -16,6 +16,8 @@
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
 
+#include <executorch/backends/vulkan/runtime/api/memory/Allocator.h>
+
 #include <array>
 #include <mutex>
 #include <ostream>
@@ -136,7 +138,7 @@ class Adapter final {
   ComputePipelineCache compute_pipeline_cache_;
   // Memory Management
   SamplerCache sampler_cache_;
-  MemoryAllocator vma_;
+  Allocator vma_;
 
  public:
   // Physical Device metadata
@@ -194,7 +196,7 @@ class Adapter final {
     return sampler_cache_;
   }
 
-  inline MemoryAllocator& vma() {
+  inline Allocator& vma() {
     return vma_;
   }
 
diff --git a/backends/vulkan/runtime/api/Command.h b/backends/vulkan/runtime/api/Command.h
index 877f84d4cec..85d859c0702 100644
--- a/backends/vulkan/runtime/api/Command.h
+++ b/backends/vulkan/runtime/api/Command.h
@@ -14,10 +14,12 @@
 
 #include <executorch/backends/vulkan/runtime/api/Descriptor.h>
 #include <executorch/backends/vulkan/runtime/api/Pipeline.h>
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
 
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
+
 namespace vkcompute {
 namespace api {
 
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index bddb9e1ae17..5bc4506628b 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -15,13 +15,15 @@
 #include <executorch/backends/vulkan/runtime/api/Adapter.h>
 #include <executorch/backends/vulkan/runtime/api/Command.h>
 #include <executorch/backends/vulkan/runtime/api/Descriptor.h>
+#include <executorch/backends/vulkan/runtime/api/Fence.h>
 #include <executorch/backends/vulkan/runtime/api/Pipeline.h>
 #include <executorch/backends/vulkan/runtime/api/QueryPool.h>
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
 #include <executorch/backends/vulkan/runtime/api/Runtime.h>
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
 
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+
 namespace vkcompute {
 namespace api {
 
diff --git a/backends/vulkan/runtime/api/Descriptor.h b/backends/vulkan/runtime/api/Descriptor.h
index 915a5b824c1..e1b40fbd173 100644
--- a/backends/vulkan/runtime/api/Descriptor.h
+++ b/backends/vulkan/runtime/api/Descriptor.h
@@ -12,9 +12,11 @@
 
 #include <executorch/backends/vulkan/runtime/api/vk_api.h>
 
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
+
 #include <unordered_map>
 
 namespace vkcompute {
diff --git a/backends/vulkan/runtime/api/Exception.h b/backends/vulkan/runtime/api/Exception.h
index 28ee096984b..05dc10ee953 100644
--- a/backends/vulkan/runtime/api/Exception.h
+++ b/backends/vulkan/runtime/api/Exception.h
@@ -9,14 +9,15 @@
 #pragma once
 // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
 
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/StringUtil.h>
+
 #include <exception>
 #include <ostream>
 #include <string>
 #include <vector>
 
-#include <executorch/backends/vulkan/runtime/api/StringUtil.h>
-#include <executorch/backends/vulkan/runtime/api/vk_api.h>
-
 #define VK_CHECK(function)                                                \
   do {                                                                    \
     const VkResult result = (function);                                   \
diff --git a/backends/vulkan/runtime/api/Fence.cpp b/backends/vulkan/runtime/api/Fence.cpp
new file mode 100644
index 00000000000..6253a5e13e1
--- /dev/null
+++ b/backends/vulkan/runtime/api/Fence.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/api/Fence.h>
+
+namespace vkcompute {
+namespace api {
+
+VulkanFence::VulkanFence()
+    : device_(VK_NULL_HANDLE), handle_(VK_NULL_HANDLE), waiting_(false) {}
+
+// Creates a fence on `device`; it is not yet awaited, so waiting_ is false.
+VulkanFence::VulkanFence(VkDevice device)
+    : device_(device), handle_(VK_NULL_HANDLE), waiting_(false) {
+  const VkFenceCreateInfo fence_create_info{
+      VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, // sType
+      nullptr, // pNext
+      0u, // flags
+  };
+  VK_CHECK(vkCreateFence(device_, &fence_create_info, nullptr, &handle_));
+}
+
+VulkanFence::VulkanFence(VulkanFence&& other) noexcept
+    : device_(other.device_), handle_(other.handle_), waiting_(other.waiting_) {
+  other.handle_ = VK_NULL_HANDLE;
+  other.waiting_ = false;
+}
+
+VulkanFence& VulkanFence::operator=(VulkanFence&& other) noexcept {
+  if (this == &other) return *this; // self-move must keep the owned fence
+  if (VK_NULL_HANDLE != handle_) vkDestroyFence(device_, handle_, nullptr);
+  device_ = other.device_; // the old fence was destroyed above, not leaked
+  handle_ = other.handle_;
+  waiting_ = other.waiting_;
+  other.device_ = VK_NULL_HANDLE;
+  other.handle_ = VK_NULL_HANDLE;
+  other.waiting_ = false; // `other` now owns nothing and is not waiting
+  return *this;
+}
+
+VulkanFence::~VulkanFence() {
+  if (VK_NULL_HANDLE == handle_) {
+    return;
+  }
+  vkDestroyFence(device_, handle_, nullptr);
+}
+
+void VulkanFence::wait() {
+  // if get_submit_handle() has not been called, then this will no-op
+  if (waiting_) {
+    VkResult fence_status = VK_NOT_READY;
+    // Run the wait in a loop to keep the CPU hot. A single call to
+    // vkWaitForFences with no timeout may cause the calling thread to be
+    // scheduled out.
+    do {
+      // The timeout (last) arg is in units of ns
+      fence_status = vkWaitForFences(device_, 1u, &handle_, VK_TRUE, 100000);
+      VK_CHECK_COND(
+          fence_status != VK_ERROR_DEVICE_LOST,
+          "Vulkan Fence: Device lost while waiting for fence!");
+      // Retry only on VK_TIMEOUT (not signaled yet); looping on any other
+      // error code would never terminate.
+    } while (fence_status == VK_TIMEOUT);
+    VK_CHECK(fence_status); // pass any remaining non-timeout status to VK_CHECK
+    VK_CHECK(vkResetFences(device_, 1u, &handle_));
+    waiting_ = false;
+  }
+}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/Fence.h b/backends/vulkan/runtime/api/Fence.h
new file mode 100644
index 00000000000..613a24aaec5
--- /dev/null
+++ b/backends/vulkan/runtime/api/Fence.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
+
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/Exception.h>
+
+#include <stack>
+
+namespace vkcompute {
+namespace api {
+
+class VulkanFence final {
+ public:
+  // TODO: This is required for the lazy allocation pattern in api/Tensor.
+  //       It will be disabled pending future refactors.
+  explicit VulkanFence();
+
+  explicit VulkanFence(VkDevice);
+
+  VulkanFence(const VulkanFence&) = delete;
+  VulkanFence& operator=(const VulkanFence&) = delete;
+
+  VulkanFence(VulkanFence&&) noexcept;
+  VulkanFence& operator=(VulkanFence&&) noexcept;
+
+  ~VulkanFence();
+
+ private:
+  VkDevice device_;
+  VkFence handle_;
+  bool waiting_;
+
+ public:
+  // Used to get the handle for a queue submission.
+  VkFence get_submit_handle() {
+    if (handle_ != VK_NULL_HANDLE) {
+      // Indicate we are now waiting for this fence to be signaled
+      waiting_ = true;
+    }
+    return handle_;
+  }
+
+  VkFence handle() {
+    return handle_;
+  }
+
+  // Trigger a synchronous wait for the fence to be signaled
+  void wait();
+
+  bool waiting() const {
+    return waiting_;
+  }
+
+  operator bool() const {
+    return (VK_NULL_HANDLE != handle_);
+  }
+};
+
+// A pool to track created Fences and reuse ones that are available.
+// Only intended to be modified by one thread at a time.
+struct FencePool final {
+  VkDevice device_;
+
+  std::stack<VulkanFence> pool_;
+
+  explicit FencePool(VkDevice device) : device_(device), pool_{} {}
+
+  // Returns a fence by value: a pooled one if any, else a newly created one
+  inline VulkanFence get_fence() {
+    if (pool_.empty()) {
+      VulkanFence new_fence = VulkanFence(device_);
+      return new_fence;
+    }
+
+    VulkanFence top_fence = std::move(pool_.top());
+    pool_.pop();
+
+    return top_fence;
+  }
+
+  // Moves the fence into the pool so a later get_fence() call can reuse it
+  inline void return_fence(VulkanFence& fence) {
+    pool_.push(std::move(fence));
+  }
+};
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/Pipeline.h b/backends/vulkan/runtime/api/Pipeline.h
index b8c16efd910..351c8be918a 100644
--- a/backends/vulkan/runtime/api/Pipeline.h
+++ b/backends/vulkan/runtime/api/Pipeline.h
@@ -12,9 +12,11 @@
 
 #include <executorch/backends/vulkan/runtime/api/vk_api.h>
 
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
+
 #include <mutex>
 #include <unordered_map>
 
diff --git a/backends/vulkan/runtime/api/Resource.cpp b/backends/vulkan/runtime/api/Resource.cpp
deleted file mode 100644
index d15dfc05275..00000000000
--- a/backends/vulkan/runtime/api/Resource.cpp
+++ /dev/null
@@ -1,838 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <executorch/backends/vulkan/runtime/api/Adapter.h>
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
-
-#define PRINT_FIELD(struct, field) #field << ": " << struct.field << std::endl
-
-std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats) {
-  VmaDetailedStatistics total_stats = stats.total;
-  out << "VmaTotalStatistics: " << std::endl;
-  out << "  " << PRINT_FIELD(total_stats.statistics, blockCount);
-  out << "  " << PRINT_FIELD(total_stats.statistics, allocationCount);
-  out << "  " << PRINT_FIELD(total_stats.statistics, blockBytes);
-  out << "  " << PRINT_FIELD(total_stats.statistics, allocationBytes);
-  return out;
-}
-
-#undef PRINT_FIELD
-
-namespace vkcompute {
-namespace api {
-
-//
-// MemoryBarrier
-//
-
-MemoryBarrier::MemoryBarrier(
-    const VkAccessFlags src_access_flags,
-    const VkAccessFlags dst_access_flags)
-    : handle{
-          VK_STRUCTURE_TYPE_MEMORY_BARRIER, // sType
-          nullptr, // pNext
-          src_access_flags, // srcAccessMask
-          dst_access_flags, // dstAccessMask
-      } {}
-
-//
-// MemoryAllocation
-//
-
-MemoryAllocation::MemoryAllocation()
-    : memory_requirements{},
-      create_info{},
-      allocator(VK_NULL_HANDLE),
-      allocation(VK_NULL_HANDLE) {}
-
-MemoryAllocation::MemoryAllocation(
-    VmaAllocator vma_allocator,
-    const VkMemoryRequirements& mem_props,
-    const VmaAllocationCreateInfo& create_info)
-    : memory_requirements(mem_props),
-      create_info(create_info),
-      allocator(vma_allocator),
-      allocation(VK_NULL_HANDLE) {
-  VK_CHECK(vmaAllocateMemory(
-      allocator, &memory_requirements, &create_info, &allocation, nullptr));
-}
-
-MemoryAllocation::MemoryAllocation(MemoryAllocation&& other) noexcept
-    : memory_requirements(other.memory_requirements),
-      create_info(other.create_info),
-      allocator(other.allocator),
-      allocation(other.allocation) {
-  other.allocation = VK_NULL_HANDLE;
-}
-
-MemoryAllocation& MemoryAllocation::operator=(
-    MemoryAllocation&& other) noexcept {
-  VmaAllocation tmp_allocation = allocation;
-
-  memory_requirements = other.memory_requirements;
-  create_info = other.create_info;
-  allocator = other.allocator;
-  allocation = other.allocation;
-
-  other.allocation = tmp_allocation;
-
-  return *this;
-}
-
-MemoryAllocation::~MemoryAllocation() {
-  if (VK_NULL_HANDLE != allocation) {
-    vmaFreeMemory(allocator, allocation);
-  }
-}
-
-//
-// VulkanBuffer
-//
-
-VulkanBuffer::VulkanBuffer()
-    : buffer_properties_{},
-      allocator_(VK_NULL_HANDLE),
-      memory_{},
-      owns_memory_(false),
-      handle_(VK_NULL_HANDLE) {}
-
-VulkanBuffer::VulkanBuffer(
-    VmaAllocator vma_allocator,
-    const VkDeviceSize size,
-    const VmaAllocationCreateInfo& allocation_create_info,
-    const VkBufferUsageFlags usage,
-    const bool allocate_memory)
-    : buffer_properties_({
-          size,
-          0u,
-          size,
-          usage,
-      }),
-      allocator_(vma_allocator),
-      memory_{},
-      owns_memory_(allocate_memory),
-      handle_(VK_NULL_HANDLE) {
-  // Only allocate memory if the buffer has non-zero size
-  if (size == 0) {
-    return;
-  }
-
-  const VkBufferCreateInfo buffer_create_info{
-      VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-      size, // size
-      buffer_properties_.buffer_usage, // usage
-      VK_SHARING_MODE_EXCLUSIVE, // sharingMode
-      0u, // queueFamilyIndexCount
-      nullptr, // pQueueFamilyIndices
-  };
-
-  memory_.create_info = allocation_create_info;
-
-  if (allocate_memory) {
-    VK_CHECK(vmaCreateBuffer(
-        allocator_,
-        &buffer_create_info,
-        &allocation_create_info,
-        &handle_,
-        &(memory_.allocation),
-        nullptr));
-  } else {
-    VmaAllocatorInfo allocator_info{};
-    vmaGetAllocatorInfo(allocator_, &allocator_info);
-    VK_CHECK(vkCreateBuffer(
-        allocator_info.device, &buffer_create_info, nullptr, &handle_));
-  }
-}
-
-VulkanBuffer::VulkanBuffer(VulkanBuffer&& other) noexcept
-    : buffer_properties_(other.buffer_properties_),
-      allocator_(other.allocator_),
-      memory_(std::move(other.memory_)),
-      owns_memory_(other.owns_memory_),
-      handle_(other.handle_) {
-  other.handle_ = VK_NULL_HANDLE;
-}
-
-VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept {
-  VkBuffer tmp_buffer = handle_;
-  bool tmp_owns_memory = owns_memory_;
-
-  buffer_properties_ = other.buffer_properties_;
-  allocator_ = other.allocator_;
-  memory_ = std::move(other.memory_);
-  owns_memory_ = other.owns_memory_;
-  handle_ = other.handle_;
-
-  other.handle_ = tmp_buffer;
-  other.owns_memory_ = tmp_owns_memory;
-
-  return *this;
-}
-
-VulkanBuffer::~VulkanBuffer() {
-  if (VK_NULL_HANDLE != handle_) {
-    if (owns_memory_) {
-      vmaDestroyBuffer(allocator_, handle_, memory_.allocation);
-    } else {
-      vkDestroyBuffer(this->device(), handle_, nullptr);
-    }
-    // Prevent the underlying memory allocation from being freed; it was either
-    // freed by vmaDestroyBuffer, or this resource does not own the underlying
-    // memory
-    memory_.allocation = VK_NULL_HANDLE;
-  }
-}
-
-VkMemoryRequirements VulkanBuffer::get_memory_requirements() const {
-  VkMemoryRequirements memory_requirements;
-  vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements);
-  return memory_requirements;
-}
-
-//
-// MemoryMap
-//
-
-MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access)
-    : access_(access),
-      allocator_(buffer.vma_allocator()),
-      allocation_(buffer.allocation()),
-      data_(nullptr),
-      data_len_{buffer.mem_size()} {
-  if (allocation_) {
-    VK_CHECK(vmaMapMemory(allocator_, allocation_, &data_));
-  }
-}
-
-MemoryMap::MemoryMap(MemoryMap&& other) noexcept
-    : access_(other.access_),
-      allocator_(other.allocator_),
-      allocation_(other.allocation_),
-      data_(other.data_),
-      data_len_{other.data_len_} {
-  other.allocation_ = VK_NULL_HANDLE;
-  other.data_ = nullptr;
-}
-
-MemoryMap::~MemoryMap() {
-  if (!data_) {
-    return;
-  }
-
-  if (allocation_) {
-    if (access_ & MemoryAccessType::WRITE) {
-      // Call will be ignored by implementation if the memory type this
-      // allocation belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is
-      // the behavior we want. Don't check the result here as the destructor
-      // cannot throw.
-      vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE);
-    }
-
-    vmaUnmapMemory(allocator_, allocation_);
-  }
-}
-
-void MemoryMap::invalidate() {
-  if (access_ & MemoryAccessType::READ && allocation_) {
-    // Call will be ignored by implementation if the memory type this allocation
-    // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior
-    // we want.
-    VK_CHECK(
-        vmaInvalidateAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE));
-  }
-}
-
-//
-// BufferMemoryBarrier
-//
-
-BufferMemoryBarrier::BufferMemoryBarrier(
-    const VkAccessFlags src_access_flags,
-    const VkAccessFlags dst_access_flags,
-    const VulkanBuffer& buffer)
-    : handle{
-          VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // sType
-          nullptr, // pNext
-          src_access_flags, // srcAccessMask
-          dst_access_flags, // dstAccessMask
-          VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex
-          VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex
-          buffer.handle_, // buffer
-          buffer.buffer_properties_.mem_offset, // offset
-          buffer.buffer_properties_.mem_range, // size
-      } {}
-
-//
-// ImageSampler
-//
-
-bool operator==(
-    const ImageSampler::Properties& _1,
-    const ImageSampler::Properties& _2) {
-  return (
-      _1.filter == _2.filter && _1.mipmap_mode == _2.mipmap_mode &&
-      _1.address_mode == _2.address_mode && _1.border_color == _2.border_color);
-}
-
-ImageSampler::ImageSampler(
-    VkDevice device,
-    const ImageSampler::Properties& props)
-    : device_(device), handle_(VK_NULL_HANDLE) {
-  const VkSamplerCreateInfo sampler_create_info{
-      VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-      props.filter, // magFilter
-      props.filter, // minFilter
-      props.mipmap_mode, // mipmapMode
-      props.address_mode, // addressModeU
-      props.address_mode, // addressModeV
-      props.address_mode, // addressModeW
-      0.0f, // mipLodBias
-      VK_FALSE, // anisotropyEnable
-      1.0f, // maxAnisotropy,
-      VK_FALSE, // compareEnable
-      VK_COMPARE_OP_NEVER, // compareOp
-      0.0f, // minLod
-      VK_LOD_CLAMP_NONE, // maxLod
-      props.border_color, // borderColor
-      VK_FALSE, // unnormalizedCoordinates
-  };
-
-  VK_CHECK(vkCreateSampler(device_, &sampler_create_info, nullptr, &handle_));
-}
-
-ImageSampler::ImageSampler(ImageSampler&& other) noexcept
-    : device_(other.device_), handle_(other.handle_) {
-  other.handle_ = VK_NULL_HANDLE;
-}
-
-ImageSampler::~ImageSampler() {
-  if (VK_NULL_HANDLE == handle_) {
-    return;
-  }
-  vkDestroySampler(device_, handle_, nullptr);
-}
-
-size_t ImageSampler::Hasher::operator()(
-    const ImageSampler::Properties& props) const {
-  size_t seed = 0;
-  seed = utils::hash_combine(seed, std::hash<VkFilter>()(props.filter));
-  seed = utils::hash_combine(
-      seed, std::hash<VkSamplerMipmapMode>()(props.mipmap_mode));
-  seed = utils::hash_combine(
-      seed, std::hash<VkSamplerAddressMode>()(props.address_mode));
-  seed =
-      utils::hash_combine(seed, std::hash<VkBorderColor>()(props.border_color));
-  return seed;
-}
-
-void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept {
-  VkDevice tmp_device = lhs.device_;
-  VkSampler tmp_handle = lhs.handle_;
-
-  lhs.device_ = rhs.device_;
-  lhs.handle_ = rhs.handle_;
-
-  rhs.device_ = tmp_device;
-  rhs.handle_ = tmp_handle;
-}
-
-//
-// VulkanImage
-//
-
-VulkanImage::VulkanImage()
-    : image_properties_{},
-      view_properties_{},
-      sampler_properties_{},
-      allocator_(VK_NULL_HANDLE),
-      memory_{},
-      owns_memory_(false),
-      handles_{
-          VK_NULL_HANDLE,
-          VK_NULL_HANDLE,
-          VK_NULL_HANDLE,
-      },
-      layout_{} {}
-
-VulkanImage::VulkanImage(
-    VmaAllocator vma_allocator,
-    const VmaAllocationCreateInfo& allocation_create_info,
-    const ImageProperties& image_props,
-    const ViewProperties& view_props,
-    const SamplerProperties& sampler_props,
-    const VkImageLayout layout,
-    VkSampler sampler,
-    const bool allocate_memory)
-    : image_properties_(image_props),
-      view_properties_(view_props),
-      sampler_properties_(sampler_props),
-      allocator_(vma_allocator),
-      memory_{},
-      owns_memory_{allocate_memory},
-      handles_{
-          VK_NULL_HANDLE,
-          VK_NULL_HANDLE,
-          sampler,
-      },
-      layout_(layout) {
-  VmaAllocatorInfo allocator_info{};
-  vmaGetAllocatorInfo(allocator_, &allocator_info);
-
-  // If any dims are zero, then no memory will be allocated for the image.
-  if (image_props.image_extents.width == 0 ||
-      image_props.image_extents.height == 0 ||
-      image_props.image_extents.depth == 0) {
-    return;
-  }
-
-  const VkImageCreateInfo image_create_info{
-      VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-      image_properties_.image_type, // imageType
-      image_properties_.image_format, // format
-      image_properties_.image_extents, // extents
-      1u, // mipLevels
-      1u, // arrayLayers
-      VK_SAMPLE_COUNT_1_BIT, // samples
-      VK_IMAGE_TILING_OPTIMAL, // tiling
-      image_properties_.image_usage, // usage
-      VK_SHARING_MODE_EXCLUSIVE, // sharingMode
-      0u, // queueFamilyIndexCount
-      nullptr, // pQueueFamilyIndices
-      layout_, // initialLayout
-  };
-
-  memory_.create_info = allocation_create_info;
-
-  if (allocate_memory) {
-    VK_CHECK(vmaCreateImage(
-        allocator_,
-        &image_create_info,
-        &allocation_create_info,
-        &(handles_.image),
-        &(memory_.allocation),
-        nullptr));
-    // Only create the image view if the image has been bound to memory
-    create_image_view();
-  } else {
-    VK_CHECK(vkCreateImage(
-        allocator_info.device, &image_create_info, nullptr, &(handles_.image)));
-  }
-}
-
-VulkanImage::VulkanImage(VulkanImage&& other) noexcept
-    : image_properties_(other.image_properties_),
-      view_properties_(other.view_properties_),
-      sampler_properties_(other.sampler_properties_),
-      allocator_(other.allocator_),
-      memory_(std::move(other.memory_)),
-      owns_memory_(other.owns_memory_),
-      handles_(other.handles_),
-      layout_(other.layout_) {
-  other.handles_.image = VK_NULL_HANDLE;
-  other.handles_.image_view = VK_NULL_HANDLE;
-  other.handles_.sampler = VK_NULL_HANDLE;
-  other.owns_memory_ = false;
-}
-
-VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept {
-  VkImage tmp_image = handles_.image;
-  VkImageView tmp_image_view = handles_.image_view;
-  bool tmp_owns_memory = owns_memory_;
-
-  image_properties_ = other.image_properties_;
-  view_properties_ = other.view_properties_;
-  sampler_properties_ = other.sampler_properties_;
-  allocator_ = other.allocator_;
-  memory_ = std::move(other.memory_);
-  owns_memory_ = other.owns_memory_;
-  handles_ = other.handles_;
-  layout_ = other.layout_;
-
-  other.handles_.image = tmp_image;
-  other.handles_.image_view = tmp_image_view;
-  other.owns_memory_ = tmp_owns_memory;
-
-  return *this;
-}
-
-VulkanImage::~VulkanImage() {
-  if (VK_NULL_HANDLE != handles_.image_view) {
-    vkDestroyImageView(this->device(), handles_.image_view, nullptr);
-  }
-
-  if (VK_NULL_HANDLE != handles_.image) {
-    if (owns_memory_) {
-      vmaDestroyImage(allocator_, handles_.image, memory_.allocation);
-    } else {
-      vkDestroyImage(this->device(), handles_.image, nullptr);
-    }
-    // Prevent the underlying memory allocation from being freed; it was either
-    // freed by vmaDestroyImage, or this resource does not own the underlying
-    // memory
-    memory_.allocation = VK_NULL_HANDLE;
-  }
-}
-
-void VulkanImage::create_image_view() {
-  VmaAllocatorInfo allocator_info{};
-  vmaGetAllocatorInfo(allocator_, &allocator_info);
-
-  const VkComponentMapping component_mapping{
-      VK_COMPONENT_SWIZZLE_IDENTITY, // r
-      VK_COMPONENT_SWIZZLE_IDENTITY, // g
-      VK_COMPONENT_SWIZZLE_IDENTITY, // b
-      VK_COMPONENT_SWIZZLE_IDENTITY, // a
-  };
-
-  const VkImageSubresourceRange subresource_range{
-      VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask
-      0u, // baseMipLevel
-      VK_REMAINING_MIP_LEVELS, // levelCount
-      0u, // baseArrayLayer
-      VK_REMAINING_ARRAY_LAYERS, // layerCount
-  };
-
-  const VkImageViewCreateInfo image_view_create_info{
-      VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-      handles_.image, // image
-      view_properties_.view_type, // viewType
-      view_properties_.view_format, // format
-      component_mapping, // components
-      subresource_range, // subresourceRange
-  };
-
-  VK_CHECK(vkCreateImageView(
-      allocator_info.device,
-      &(image_view_create_info),
-      nullptr,
-      &(handles_.image_view)));
-}
-
-VkMemoryRequirements VulkanImage::get_memory_requirements() const {
-  VkMemoryRequirements memory_requirements;
-  vkGetImageMemoryRequirements(
-      this->device(), handles_.image, &memory_requirements);
-  return memory_requirements;
-}
-
-//
-// ImageMemoryBarrier
-//
-
-ImageMemoryBarrier::ImageMemoryBarrier(
-    const VkAccessFlags src_access_flags,
-    const VkAccessFlags dst_access_flags,
-    const VkImageLayout src_layout_flags,
-    const VkImageLayout dst_layout_flags,
-    const VulkanImage& image)
-    : handle{
-          VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, // sType
-          nullptr, // pNext
-          src_access_flags, // srcAccessMask
-          dst_access_flags, // dstAccessMask
-          src_layout_flags, // oldLayout
-          dst_layout_flags, // newLayout
-          VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex
-          VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex
-          image.handles_.image, // image
-          {
-              // subresourceRange
-              VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask
-              0u, // baseMipLevel
-              VK_REMAINING_MIP_LEVELS, // levelCount
-              0u, // baseArrayLayer
-              VK_REMAINING_ARRAY_LAYERS, // layerCount
-          },
-      } {}
-
-//
-// SamplerCache
-//
-
-SamplerCache::SamplerCache(VkDevice device)
-    : cache_mutex_{}, device_(device), cache_{} {}
-
-SamplerCache::SamplerCache(SamplerCache&& other) noexcept
-    : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) {
-  std::lock_guard<std::mutex> lock(other.cache_mutex_);
-}
-
-SamplerCache::~SamplerCache() {
-  purge();
-}
-
-VkSampler SamplerCache::retrieve(const SamplerCache::Key& key) {
-  std::lock_guard<std::mutex> lock(cache_mutex_);
-
-  auto it = cache_.find(key);
-  if (cache_.cend() == it) {
-    it = cache_.insert({key, SamplerCache::Value(device_, key)}).first;
-  }
-
-  return it->second.handle();
-}
-
-void SamplerCache::purge() {
-  std::lock_guard<std::mutex> lock(cache_mutex_);
-  cache_.clear();
-}
-
-//
-// MemoryAllocator
-//
-
-MemoryAllocator::MemoryAllocator(
-    VkInstance instance,
-    VkPhysicalDevice physical_device,
-    VkDevice device)
-    : instance_{},
-      physical_device_(physical_device),
-      device_(device),
-      allocator_{VK_NULL_HANDLE} {
-  VmaVulkanFunctions vk_functions{};
-  vk_functions.vkGetInstanceProcAddr = vkGetInstanceProcAddr;
-  vk_functions.vkGetDeviceProcAddr = vkGetDeviceProcAddr;
-
-  const VmaAllocatorCreateInfo allocator_create_info{
-      0u, // flags
-      physical_device_, // physicalDevice
-      device_, // device
-      0u, // preferredLargeHeapBlockSize
-      nullptr, // pAllocationCallbacks
-      nullptr, // pDeviceMemoryCallbacks
-      nullptr, // pHeapSizeLimit
-      &vk_functions, // pVulkanFunctions
-      instance, // instance
-      VK_API_VERSION_1_0, // vulkanApiVersion
-      nullptr, // pTypeExternalMemoryHandleTypes
-  };
-
-  VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator_));
-}
-
-MemoryAllocator::MemoryAllocator(MemoryAllocator&& other) noexcept
-    : instance_(other.instance_),
-      physical_device_(other.physical_device_),
-      device_(other.device_),
-      allocator_(other.allocator_) {
-  other.allocator_ = VK_NULL_HANDLE;
-  other.device_ = VK_NULL_HANDLE;
-  other.physical_device_ = VK_NULL_HANDLE;
-  other.instance_ = VK_NULL_HANDLE;
-}
-
-MemoryAllocator::~MemoryAllocator() {
-  if (VK_NULL_HANDLE == allocator_) {
-    return;
-  }
-  vmaDestroyAllocator(allocator_);
-}
-
-MemoryAllocation MemoryAllocator::create_allocation(
-    const VkMemoryRequirements& memory_requirements,
-    const VmaAllocationCreateInfo& create_info) {
-  VmaAllocationCreateInfo alloc_create_info = create_info;
-  // Protect against using VMA_MEMORY_USAGE_AUTO_* flags when allocating memory
-  // directly, since those usage flags require that VkBufferCreateInfo and/or
-  // VkImageCreateInfo also be available.
-  switch (create_info.usage) {
-    // The logic for the below usage options are too complex, therefore prevent
-    // those from being used with direct memory allocation.
-    case VMA_MEMORY_USAGE_AUTO:
-    case VMA_MEMORY_USAGE_AUTO_PREFER_HOST:
-      VK_THROW(
-          "Only the VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE usage flag is compatible with create_allocation()");
-      break;
-    // Most of the time, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE will simply set the
-    // DEVICE_LOCAL_BIT as a preferred memory flag. Therefore the below is a
-    // decent approximation for VMA behaviour.
-    case VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE:
-      alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
-      alloc_create_info.usage = VMA_MEMORY_USAGE_UNKNOWN;
-      break;
-    default:
-      break;
-  }
-
-  return MemoryAllocation(allocator_, memory_requirements, alloc_create_info);
-}
-
-VulkanImage MemoryAllocator::create_image(
-    const VkExtent3D& extents,
-    const VkFormat image_format,
-    const VkImageType image_type,
-    const VkImageViewType image_view_type,
-    const VulkanImage::SamplerProperties& sampler_props,
-    VkSampler sampler,
-    const bool allow_transfer,
-    const bool allocate_memory) {
-  VkImageUsageFlags usage =
-      VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT;
-  if (allow_transfer) {
-    usage |=
-        (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
-  }
-
-  VmaAllocationCreateInfo alloc_create_info = {};
-  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
-  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
-
-  const VulkanImage::ImageProperties image_props{
-      image_type,
-      image_format,
-      extents,
-      usage,
-  };
-
-  const VulkanImage::ViewProperties view_props{
-      image_view_type,
-      image_format,
-  };
-
-  const VkImageLayout initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
-
-  return VulkanImage(
-      allocator_,
-      alloc_create_info,
-      image_props,
-      view_props,
-      sampler_props,
-      initial_layout,
-      sampler,
-      allocate_memory);
-}
-
-VulkanBuffer MemoryAllocator::create_storage_buffer(
-    const VkDeviceSize size,
-    const bool gpu_only,
-    const bool allocate_memory) {
-  const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-
-  VmaAllocationCreateInfo alloc_create_info = {};
-  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
-  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
-
-  // The create storage buffer will be accessed by both the CPU and GPU, so set
-  // the appropriate flags to indicate that the host device will be accessing
-  // the data from this buffer.
-  if (!gpu_only) {
-    // Deferred memory allocation should only be used for GPU only buffers.
-    VK_CHECK_COND(
-        allocate_memory,
-        "Only GPU-only buffers should use deferred memory allocation");
-
-    alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
-    alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
-    alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
-    alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-        VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
-  }
-
-  return VulkanBuffer(
-      allocator_, size, alloc_create_info, buffer_usage, allocate_memory);
-}
-
-VulkanBuffer MemoryAllocator::create_staging_buffer(const VkDeviceSize size) {
-  VmaAllocationCreateInfo alloc_create_info = {};
-  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
-  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
-
-  VkBufferUsageFlags buffer_usage =
-      VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
-
-  return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage);
-}
-
-VulkanBuffer MemoryAllocator::create_uniform_buffer(const VkDeviceSize size) {
-  VmaAllocationCreateInfo alloc_create_info = {};
-  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY |
-      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
-  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO;
-
-  VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-
-  VulkanBuffer uniform_buffer(
-      allocator_, size, alloc_create_info, buffer_usage);
-  return uniform_buffer;
-}
-
-//
-// VulkanFence
-//
-
-VulkanFence::VulkanFence()
-    : device_(VK_NULL_HANDLE), handle_(VK_NULL_HANDLE), waiting_(false) {}
-
-VulkanFence::VulkanFence(VkDevice device)
-    : device_(device), handle_(VK_NULL_HANDLE), waiting_(VK_NULL_HANDLE) {
-  const VkFenceCreateInfo fence_create_info{
-      VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-  };
-
-  VK_CHECK(vkCreateFence(device_, &fence_create_info, nullptr, &handle_));
-}
-
-VulkanFence::VulkanFence(VulkanFence&& other) noexcept
-    : device_(other.device_), handle_(other.handle_), waiting_(other.waiting_) {
-  other.handle_ = VK_NULL_HANDLE;
-  other.waiting_ = false;
-}
-
-VulkanFence& VulkanFence::operator=(VulkanFence&& other) noexcept {
-  device_ = other.device_;
-  handle_ = other.handle_;
-  waiting_ = other.waiting_;
-
-  other.device_ = VK_NULL_HANDLE;
-  other.handle_ = VK_NULL_HANDLE;
-  other.waiting_ = false;
-
-  return *this;
-}
-
-VulkanFence::~VulkanFence() {
-  if (VK_NULL_HANDLE == handle_) {
-    return;
-  }
-  vkDestroyFence(device_, handle_, nullptr);
-}
-
-void VulkanFence::wait() {
-  // if get_submit_handle() has not been called, then this will no-op
-  if (waiting_) {
-    VkResult fence_status = VK_NOT_READY;
-    // Run the wait in a loop to keep the CPU hot. A single call to
-    // vkWaitForFences with no timeout may cause the calling thread to be
-    // scheduled out.
-    do {
-      // The timeout (last) arg is in units of ns
-      fence_status = vkWaitForFences(device_, 1u, &handle_, VK_TRUE, 100000);
-
-      VK_CHECK_COND(
-          fence_status != VK_ERROR_DEVICE_LOST,
-          "Vulkan Fence: Device lost while waiting for fence!");
-    } while (fence_status != VK_SUCCESS);
-
-    VK_CHECK(vkResetFences(device_, 1u, &handle_));
-
-    waiting_ = false;
-  }
-}
-
-} // namespace api
-} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/Resource.h b/backends/vulkan/runtime/api/Resource.h
deleted file mode 100644
index 247e2f1c932..00000000000
--- a/backends/vulkan/runtime/api/Resource.h
+++ /dev/null
@@ -1,599 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
-
-#include <executorch/backends/vulkan/runtime/api/vk_api.h>
-#include <executorch/backends/vulkan/runtime/api/vma_api.h>
-
-#include <executorch/backends/vulkan/runtime/api/Types.h>
-#include <executorch/backends/vulkan/runtime/api/Utils.h>
-
-#include <mutex>
-#include <ostream>
-#include <stack>
-#include <unordered_map>
-
-std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats);
-
-namespace vkcompute {
-namespace api {
-
-using MemoryAccessFlags = uint8_t;
-
-constexpr VmaAllocationCreateFlags DEFAULT_ALLOCATION_STRATEGY =
-    VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT;
-
-enum MemoryAccessType : MemoryAccessFlags {
-  NONE = 0u << 0u,
-  READ = 1u << 0u,
-  WRITE = 1u << 1u,
-};
-
-struct MemoryBarrier final {
-  VkMemoryBarrier handle;
-
-  MemoryBarrier(
-      const VkAccessFlags src_access_flags,
-      const VkAccessFlags dst_access_flags);
-};
-
-struct MemoryAllocation final {
-  explicit MemoryAllocation();
-
-  explicit MemoryAllocation(
-      const VmaAllocator,
-      const VkMemoryRequirements&,
-      const VmaAllocationCreateInfo&);
-
-  MemoryAllocation(const MemoryAllocation&) = delete;
-  MemoryAllocation& operator=(const MemoryAllocation&) = delete;
-
-  MemoryAllocation(MemoryAllocation&&) noexcept;
-  MemoryAllocation& operator=(MemoryAllocation&&) noexcept;
-
-  ~MemoryAllocation();
-
-  VkMemoryRequirements memory_requirements;
-  // The properties this allocation was created with
-  VmaAllocationCreateInfo create_info;
-  // The allocator object this was allocated from
-  VmaAllocator allocator;
-  // Handles to the allocated memory
-  VmaAllocation allocation;
-
-  operator bool() const {
-    return (allocation != VK_NULL_HANDLE);
-  }
-};
-
-class VulkanBuffer final {
- public:
-  struct BufferProperties final {
-    VkDeviceSize size;
-    VkDeviceSize mem_offset;
-    VkDeviceSize mem_range;
-    VkBufferUsageFlags buffer_usage;
-  };
-
-  explicit VulkanBuffer();
-
-  explicit VulkanBuffer(
-      const VmaAllocator,
-      const VkDeviceSize,
-      const VmaAllocationCreateInfo&,
-      const VkBufferUsageFlags,
-      const bool allocate_memory = true);
-
-  VulkanBuffer(const VulkanBuffer&) = delete;
-  VulkanBuffer& operator=(const VulkanBuffer&) = delete;
-
-  VulkanBuffer(VulkanBuffer&&) noexcept;
-  VulkanBuffer& operator=(VulkanBuffer&&) noexcept;
-
-  ~VulkanBuffer();
-
-  struct Package final {
-    VkBuffer handle;
-    VkDeviceSize buffer_offset;
-    VkDeviceSize buffer_range;
-  };
-
-  friend struct BufferMemoryBarrier;
-
- private:
-  BufferProperties buffer_properties_;
-  VmaAllocator allocator_;
-  MemoryAllocation memory_;
-  // Indicates whether the underlying memory is owned by this resource
-  bool owns_memory_;
-  VkBuffer handle_;
-
- public:
-  inline VkDevice device() const {
-    VmaAllocatorInfo allocator_info{};
-    vmaGetAllocatorInfo(allocator_, &allocator_info);
-    return allocator_info.device;
-  }
-
-  inline VmaAllocator vma_allocator() const {
-    return allocator_;
-  }
-
-  inline VmaAllocation allocation() const {
-    return memory_.allocation;
-  }
-
-  inline VmaAllocationCreateInfo allocation_create_info() const {
-    return VmaAllocationCreateInfo(memory_.create_info);
-  }
-
-  inline VkBuffer handle() const {
-    return handle_;
-  }
-
-  inline VkDeviceSize mem_offset() const {
-    return buffer_properties_.mem_offset;
-  }
-
-  inline VkDeviceSize mem_range() const {
-    return buffer_properties_.mem_range;
-  }
-
-  inline VkDeviceSize mem_size() const {
-    return buffer_properties_.size;
-  }
-
-  inline bool has_memory() const {
-    return (memory_.allocation != VK_NULL_HANDLE);
-  }
-
-  inline bool owns_memory() const {
-    return owns_memory_;
-  }
-
-  operator bool() const {
-    return (handle_ != VK_NULL_HANDLE);
-  }
-
-  inline void bind_allocation(const MemoryAllocation& memory) {
-    VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
-    VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
-    memory_.allocation = memory.allocation;
-  }
-
-  VkMemoryRequirements get_memory_requirements() const;
-};
-
-class MemoryMap final {
- public:
-  explicit MemoryMap(
-      const VulkanBuffer& buffer,
-      const MemoryAccessFlags access);
-
-  MemoryMap(const MemoryMap&) = delete;
-  MemoryMap& operator=(const MemoryMap&) = delete;
-
-  MemoryMap(MemoryMap&&) noexcept;
-  MemoryMap& operator=(MemoryMap&&) = delete;
-
-  ~MemoryMap();
-
- private:
-  uint8_t access_;
-  VmaAllocator allocator_;
-  VmaAllocation allocation_;
-  void* data_;
-  VkDeviceSize data_len_;
-
- public:
-  template <typename T>
-  T* data() {
-    return reinterpret_cast<T*>(data_);
-  }
-
-  inline size_t nbytes() {
-    return utils::safe_downcast<size_t>(data_len_);
-  }
-
-  void invalidate();
-};
-
-struct BufferMemoryBarrier final {
-  VkBufferMemoryBarrier handle;
-
-  BufferMemoryBarrier(
-      const VkAccessFlags src_access_flags,
-      const VkAccessFlags dst_access_flags,
-      const VulkanBuffer& buffer);
-};
-
-class ImageSampler final {
- public:
-  struct Properties final {
-    VkFilter filter;
-    VkSamplerMipmapMode mipmap_mode;
-    VkSamplerAddressMode address_mode;
-    VkBorderColor border_color;
-  };
-
-  explicit ImageSampler(VkDevice, const Properties&);
-
-  ImageSampler(const ImageSampler&) = delete;
-  ImageSampler& operator=(const ImageSampler&) = delete;
-
-  ImageSampler(ImageSampler&&) noexcept;
-  ImageSampler& operator=(ImageSampler&&) = delete;
-
-  ~ImageSampler();
-
- private:
-  VkDevice device_;
-  VkSampler handle_;
-
- public:
-  VkSampler handle() const {
-    return handle_;
-  }
-
-  struct Hasher {
-    size_t operator()(const Properties&) const;
-  };
-
-  // We need to define a custom swap function since this class
-  // does not allow for move assignment. The swap function will
-  // be used in the hash map.
-  friend void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept;
-};
-
-class VulkanImage final {
- public:
-  struct ImageProperties final {
-    VkImageType image_type;
-    VkFormat image_format;
-    VkExtent3D image_extents;
-    VkImageUsageFlags image_usage;
-  };
-
-  struct ViewProperties final {
-    VkImageViewType view_type;
-    VkFormat view_format;
-  };
-
-  using SamplerProperties = ImageSampler::Properties;
-
-  struct Handles final {
-    VkImage image;
-    VkImageView image_view;
-    VkSampler sampler;
-  };
-
-  explicit VulkanImage();
-
-  explicit VulkanImage(
-      const VmaAllocator,
-      const VmaAllocationCreateInfo&,
-      const ImageProperties&,
-      const ViewProperties&,
-      const SamplerProperties&,
-      const VkImageLayout layout,
-      VkSampler,
-      const bool allocate_memory = true);
-
-  VulkanImage(const VulkanImage&) = delete;
-  VulkanImage& operator=(const VulkanImage&) = delete;
-
-  VulkanImage(VulkanImage&&) noexcept;
-  VulkanImage& operator=(VulkanImage&&) noexcept;
-
-  ~VulkanImage();
-
-  struct Package final {
-    VkImage handle;
-    VkImageLayout image_layout;
-    VkImageView image_view;
-    VkSampler image_sampler;
-  };
-
-  friend struct ImageMemoryBarrier;
-
- private:
-  ImageProperties image_properties_;
-  ViewProperties view_properties_;
-  SamplerProperties sampler_properties_;
-  // The allocator object this was allocated from
-  VmaAllocator allocator_;
-  // Handles to the allocated memory
-  MemoryAllocation memory_;
-  // Indicates whether the underlying memory is owned by this resource
-  bool owns_memory_;
-  Handles handles_;
-  // Layout
-  VkImageLayout layout_;
-
- public:
-  void create_image_view();
-
-  inline VkDevice device() const {
-    VmaAllocatorInfo allocator_info{};
-    vmaGetAllocatorInfo(allocator_, &allocator_info);
-    return allocator_info.device;
-  }
-
-  inline VmaAllocator vma_allocator() const {
-    return allocator_;
-  }
-
-  inline VmaAllocation allocation() const {
-    return memory_.allocation;
-  }
-
-  inline VmaAllocationCreateInfo allocation_create_info() const {
-    return VmaAllocationCreateInfo(memory_.create_info);
-  }
-
-  inline VkFormat format() const {
-    return image_properties_.image_format;
-  }
-
-  inline VkExtent3D extents() const {
-    return image_properties_.image_extents;
-  }
-
-  inline VkImage handle() const {
-    return handles_.image;
-  }
-
-  inline VkImageView image_view() const {
-    return handles_.image_view;
-  }
-
-  inline VkSampler sampler() const {
-    return handles_.sampler;
-  }
-
-  Package package() const {
-    return {
-        handles_.image,
-        layout_,
-        handles_.image_view,
-        handles_.sampler,
-    };
-  }
-
-  inline VkImageLayout layout() const {
-    return layout_;
-  }
-
-  inline void set_layout(const VkImageLayout layout) {
-    layout_ = layout;
-  }
-
-  inline bool has_memory() const {
-    return (memory_.allocation != VK_NULL_HANDLE);
-  }
-
-  inline bool owns_memory() const {
-    return owns_memory_;
-  }
-
-  inline operator bool() const {
-    return (handles_.image != VK_NULL_HANDLE);
-  }
-
-  inline void bind_allocation(const MemoryAllocation& memory) {
-    VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
-    VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image));
-    memory_.allocation = memory.allocation;
-
-    // Only create the image view if the image has been bound to memory
-    create_image_view();
-  }
-
-  VkMemoryRequirements get_memory_requirements() const;
-};
-
-struct ImageMemoryBarrier final {
-  VkImageMemoryBarrier handle;
-
-  ImageMemoryBarrier(
-      const VkAccessFlags src_access_flags,
-      const VkAccessFlags dst_access_flags,
-      const VkImageLayout src_layout_flags,
-      const VkImageLayout dst_layout_flags,
-      const VulkanImage& image);
-};
-
-class SamplerCache final {
- public:
-  explicit SamplerCache(VkDevice device);
-
-  SamplerCache(const SamplerCache&) = delete;
-  SamplerCache& operator=(const SamplerCache&) = delete;
-
-  SamplerCache(SamplerCache&&) noexcept;
-  SamplerCache& operator=(SamplerCache&&) = delete;
-
-  ~SamplerCache();
-
-  using Key = ImageSampler::Properties;
-  using Value = ImageSampler;
-  using Hasher = ImageSampler::Hasher;
-
- private:
-  // Multiple threads could potentially be adding entries into the cache, so use
-  // a mutex to manage access
-  std::mutex cache_mutex_;
-
-  VkDevice device_;
-  std::unordered_map<Key, Value, Hasher> cache_;
-
- public:
-  VkSampler retrieve(const Key&);
-  void purge();
-};
-
-class MemoryAllocator final {
- public:
-  explicit MemoryAllocator(
-      VkInstance instance,
-      VkPhysicalDevice physical_device,
-      VkDevice device);
-
-  MemoryAllocator(const MemoryAllocator&) = delete;
-  MemoryAllocator& operator=(const MemoryAllocator&) = delete;
-
-  MemoryAllocator(MemoryAllocator&&) noexcept;
-  MemoryAllocator& operator=(MemoryAllocator&&) = delete;
-
-  ~MemoryAllocator();
-
- private:
-  VkInstance instance_;
-  VkPhysicalDevice physical_device_;
-  VkDevice device_;
-  VmaAllocator allocator_;
-
- public:
-  MemoryAllocation create_allocation(
-      const VkMemoryRequirements& memory_requirements,
-      const VmaAllocationCreateInfo& create_info);
-
-  VulkanImage create_image(
-      const VkExtent3D&,
-      const VkFormat,
-      const VkImageType,
-      const VkImageViewType,
-      const VulkanImage::SamplerProperties&,
-      VkSampler,
-      const bool allow_transfer = false,
-      const bool allocate_memory = true);
-
-  VulkanBuffer create_storage_buffer(
-      const VkDeviceSize,
-      const bool gpu_only = true,
-      const bool allocate_memory = true);
-
-  VulkanBuffer create_staging_buffer(const VkDeviceSize);
-
-  /*
-   * Create a uniform buffer with a specified size
-   */
-  VulkanBuffer create_uniform_buffer(const VkDeviceSize);
-
-  /*
-   * Create a uniform buffer containing the data in an arbitrary struct
-   */
-  template <typename Block>
-  VulkanBuffer create_params_buffer(const Block& block);
-
-  VmaTotalStatistics get_memory_statistics() const {
-    VmaTotalStatistics stats = {};
-    vmaCalculateStatistics(allocator_, &stats);
-    return stats;
-  }
-};
-
-class VulkanFence final {
- public:
-  // TODO: This is required for the lazy allocation pattern in api/Tensor.
-  //       It will be disabled pending future refactors.
-  explicit VulkanFence();
-
-  explicit VulkanFence(VkDevice);
-
-  VulkanFence(const VulkanFence&) = delete;
-  VulkanFence& operator=(const VulkanFence&) = delete;
-
-  VulkanFence(VulkanFence&&) noexcept;
-  VulkanFence& operator=(VulkanFence&&) noexcept;
-
-  ~VulkanFence();
-
- private:
-  VkDevice device_;
-  VkFence handle_;
-  bool waiting_;
-
- public:
-  // Used to get the handle for a queue submission.
-  VkFence get_submit_handle() {
-    if (handle_ != VK_NULL_HANDLE) {
-      // Indicate we are now waiting for this fence to be signaled
-      waiting_ = true;
-    }
-    return handle_;
-  }
-
-  VkFence handle() {
-    return handle_;
-  }
-
-  // Trigger a synchronous wait for the fence to be signaled
-  void wait();
-
-  bool waiting() const {
-    return waiting_;
-  }
-
-  operator bool() const {
-    return (VK_NULL_HANDLE != handle_);
-  }
-};
-
-// A pool to track created Fences and reuse ones that are available.
-// Only intended to be modified by one thread at a time.
-struct FencePool final {
-  VkDevice device_;
-
-  std::stack<VulkanFence> pool_;
-
-  explicit FencePool(VkDevice device) : device_(device), pool_{} {}
-
-  // Returns an rvalue reference to a fence, so that it can be moved
-  inline VulkanFence get_fence() {
-    if (pool_.empty()) {
-      VulkanFence new_fence = VulkanFence(device_);
-      return new_fence;
-    }
-
-    VulkanFence top_fence = std::move(pool_.top());
-    pool_.pop();
-
-    return top_fence;
-  }
-
-  // Marks the fence as available
-  inline void return_fence(VulkanFence& fence) {
-    pool_.push(std::move(fence));
-  }
-};
-
-//
-// Impl
-//
-
-template <typename Block>
-inline VulkanBuffer MemoryAllocator::create_params_buffer(const Block& block) {
-  VulkanBuffer uniform_buffer = create_uniform_buffer(sizeof(Block));
-
-  // Fill the uniform buffer with data in block
-  {
-    MemoryMap mapping(uniform_buffer, MemoryAccessType::WRITE);
-    Block* data_ptr = mapping.template data<Block>();
-
-    *data_ptr = block;
-  }
-
-  return uniform_buffer;
-}
-
-} // namespace api
-} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp
index 402d35d75bb..cb4e0848191 100644
--- a/backends/vulkan/runtime/api/Tensor.cpp
+++ b/backends/vulkan/runtime/api/Tensor.cpp
@@ -234,7 +234,7 @@ VkMemoryRequirements vTensor::get_memory_requirements() const {
   return {};
 }
 
-void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
+void vTensor::bind_allocation(const api::Allocation& allocation) {
   switch (storage_type()) {
     case api::kBuffer:
       storage_.buffer_.bind_allocation(allocation);
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
index 787e8111204..0ddd5d9a4f1 100644
--- a/backends/vulkan/runtime/api/Tensor.h
+++ b/backends/vulkan/runtime/api/Tensor.h
@@ -259,7 +259,7 @@ class vTensor final {
   /*
    * Binds the underlying resource to the given memory allocation
    */
-  void bind_allocation(const api::MemoryAllocation& allocation);
+  void bind_allocation(const api::Allocation& allocation);
 
  private:
   /*
diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h
index 117f326cb45..16e2b969871 100644
--- a/backends/vulkan/runtime/api/api.h
+++ b/backends/vulkan/runtime/api/api.h
@@ -12,10 +12,15 @@
 #include <executorch/backends/vulkan/runtime/api/Command.h>
 #include <executorch/backends/vulkan/runtime/api/Context.h>
 #include <executorch/backends/vulkan/runtime/api/Descriptor.h>
+#include <executorch/backends/vulkan/runtime/api/Fence.h>
 #include <executorch/backends/vulkan/runtime/api/Pipeline.h>
-#include <executorch/backends/vulkan/runtime/api/Resource.h>
 #include <executorch/backends/vulkan/runtime/api/Runtime.h>
 #include <executorch/backends/vulkan/runtime/api/Shader.h>
 #include <executorch/backends/vulkan/runtime/api/ShaderRegistry.h>
 #include <executorch/backends/vulkan/runtime/api/Tensor.h>
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocation.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Allocator.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
diff --git a/backends/vulkan/runtime/api/memory/Allocation.cpp b/backends/vulkan/runtime/api/memory/Allocation.cpp
new file mode 100644
index 00000000000..9bde2ac744d
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Allocation.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocation.h>
+
+#define PRINT_FIELD(struct, field) #field << ": " << struct.field << std::endl
+
+std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats) {
+  VmaDetailedStatistics total_stats = stats.total;
+  out << "VmaTotalStatistics: " << std::endl;
+  out << "  " << PRINT_FIELD(total_stats.statistics, blockCount);
+  out << "  " << PRINT_FIELD(total_stats.statistics, allocationCount);
+  out << "  " << PRINT_FIELD(total_stats.statistics, blockBytes);
+  out << "  " << PRINT_FIELD(total_stats.statistics, allocationBytes);
+  return out;
+}
+
+#undef PRINT_FIELD
+
+namespace vkcompute {
+namespace api {
+
+Allocation::Allocation()
+    : memory_requirements{},
+      create_info{},
+      allocator(VK_NULL_HANDLE),
+      allocation(VK_NULL_HANDLE) {}
+
+Allocation::Allocation(
+    VmaAllocator vma_allocator,
+    const VkMemoryRequirements& mem_props,
+    const VmaAllocationCreateInfo& create_info)
+    : memory_requirements(mem_props),
+      create_info(create_info),
+      allocator(vma_allocator),
+      allocation(VK_NULL_HANDLE) {
+  VK_CHECK(vmaAllocateMemory(
+      allocator, &memory_requirements, &create_info, &allocation, nullptr));
+}
+
+Allocation::Allocation(Allocation&& other) noexcept
+    : memory_requirements(other.memory_requirements),
+      create_info(other.create_info),
+      allocator(other.allocator),
+      allocation(other.allocation) {
+  other.allocation = VK_NULL_HANDLE;
+}
+
+Allocation& Allocation::operator=(Allocation&& other) noexcept {
+  VmaAllocation tmp_allocation = allocation;
+
+  memory_requirements = other.memory_requirements;
+  create_info = other.create_info;
+  allocator = other.allocator;
+  allocation = other.allocation;
+
+  other.allocation = tmp_allocation;
+
+  return *this;
+}
+
+Allocation::~Allocation() {
+  if (VK_NULL_HANDLE != allocation) {
+    vmaFreeMemory(allocator, allocation);
+  }
+}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Allocation.h b/backends/vulkan/runtime/api/memory/Allocation.h
new file mode 100644
index 00000000000..b93556bd501
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Allocation.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
+
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/Exception.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/vma_api.h>
+
+#include <ostream>
+
+std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats);
+
+namespace vkcompute {
+namespace api {
+
+struct Allocation final {
+  explicit Allocation();
+
+  explicit Allocation(
+      const VmaAllocator,
+      const VkMemoryRequirements&,
+      const VmaAllocationCreateInfo&);
+
+  Allocation(const Allocation&) = delete;
+  Allocation& operator=(const Allocation&) = delete;
+
+  Allocation(Allocation&&) noexcept;
+  Allocation& operator=(Allocation&&) noexcept;
+
+  ~Allocation();
+
+  VkMemoryRequirements memory_requirements;
+  // The properties this allocation was created with
+  VmaAllocationCreateInfo create_info;
+  // The allocator object this was allocated from
+  VmaAllocator allocator;
+  // Handles to the allocated memory
+  VmaAllocation allocation;
+
+  operator bool() const {
+    return (allocation != VK_NULL_HANDLE);
+  }
+};
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Allocator.cpp b/backends/vulkan/runtime/api/memory/Allocator.cpp
new file mode 100644
index 00000000000..5749ecd0714
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Allocator.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocator.h>
+
+namespace vkcompute {
+namespace api {
+
+Allocator::Allocator(
+    VkInstance instance,
+    VkPhysicalDevice physical_device,
+    VkDevice device)
+    : instance_{},
+      physical_device_(physical_device),
+      device_(device),
+      allocator_{VK_NULL_HANDLE} {
+  VmaVulkanFunctions vk_functions{};
+  vk_functions.vkGetInstanceProcAddr = vkGetInstanceProcAddr;
+  vk_functions.vkGetDeviceProcAddr = vkGetDeviceProcAddr;
+
+  const VmaAllocatorCreateInfo allocator_create_info{
+      0u, // flags
+      physical_device_, // physicalDevice
+      device_, // device
+      0u, // preferredLargeHeapBlockSize
+      nullptr, // pAllocationCallbacks
+      nullptr, // pDeviceMemoryCallbacks
+      nullptr, // pHeapSizeLimit
+      &vk_functions, // pVulkanFunctions
+      instance, // instance
+      VK_API_VERSION_1_0, // vulkanApiVersion
+      nullptr, // pTypeExternalMemoryHandleTypes
+  };
+
+  VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator_));
+}
+
+Allocator::Allocator(Allocator&& other) noexcept
+    : instance_(other.instance_),
+      physical_device_(other.physical_device_),
+      device_(other.device_),
+      allocator_(other.allocator_) {
+  other.allocator_ = VK_NULL_HANDLE;
+  other.device_ = VK_NULL_HANDLE;
+  other.physical_device_ = VK_NULL_HANDLE;
+  other.instance_ = VK_NULL_HANDLE;
+}
+
+Allocator::~Allocator() {
+  if (VK_NULL_HANDLE == allocator_) {
+    return;
+  }
+  vmaDestroyAllocator(allocator_);
+}
+
+Allocation Allocator::create_allocation(
+    const VkMemoryRequirements& memory_requirements,
+    const VmaAllocationCreateInfo& create_info) {
+  VmaAllocationCreateInfo alloc_create_info = create_info;
+  // Protect against using VMA_MEMORY_USAGE_AUTO_* flags when allocating memory
+  // directly, since those usage flags require that VkBufferCreateInfo and/or
+  // VkImageCreateInfo also be available.
+  switch (create_info.usage) {
+    // The logic for the below usage options are too complex, therefore prevent
+    // those from being used with direct memory allocation.
+    case VMA_MEMORY_USAGE_AUTO:
+    case VMA_MEMORY_USAGE_AUTO_PREFER_HOST:
+      VK_THROW(
+          "Only the VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE usage flag is compatible with create_allocation()");
+      break;
+    // Most of the time, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE will simply set the
+    // DEVICE_LOCAL_BIT as a preferred memory flag. Therefore the below is a
+    // decent approximation for VMA behaviour.
+    case VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE:
+      alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+      alloc_create_info.usage = VMA_MEMORY_USAGE_UNKNOWN;
+      break;
+    default:
+      break;
+  }
+
+  return Allocation(allocator_, memory_requirements, alloc_create_info);
+}
+
+VulkanImage Allocator::create_image(
+    const VkExtent3D& extents,
+    const VkFormat image_format,
+    const VkImageType image_type,
+    const VkImageViewType image_view_type,
+    const VulkanImage::SamplerProperties& sampler_props,
+    VkSampler sampler,
+    const bool allow_transfer,
+    const bool allocate_memory) {
+  VkImageUsageFlags usage =
+      VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT;
+  if (allow_transfer) {
+    usage |=
+        (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+  }
+
+  VmaAllocationCreateInfo alloc_create_info = {};
+  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
+  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+
+  const VulkanImage::ImageProperties image_props{
+      image_type,
+      image_format,
+      extents,
+      usage,
+  };
+
+  const VulkanImage::ViewProperties view_props{
+      image_view_type,
+      image_format,
+  };
+
+  const VkImageLayout initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+  return VulkanImage(
+      allocator_,
+      alloc_create_info,
+      image_props,
+      view_props,
+      sampler_props,
+      initial_layout,
+      sampler,
+      allocate_memory);
+}
+
+VulkanBuffer Allocator::create_storage_buffer(
+    const VkDeviceSize size,
+    const bool gpu_only,
+    const bool allocate_memory) {
+  const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+
+  VmaAllocationCreateInfo alloc_create_info = {};
+  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
+  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+
+  // The create storage buffer will be accessed by both the CPU and GPU, so set
+  // the appropriate flags to indicate that the host device will be accessing
+  // the data from this buffer.
+  if (!gpu_only) {
+    // Deferred memory allocation should only be used for GPU only buffers.
+    VK_CHECK_COND(
+        allocate_memory,
+        "Only GPU-only buffers should use deferred memory allocation");
+
+    alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+    alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
+    alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+        VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+  }
+
+  return VulkanBuffer(
+      allocator_, size, alloc_create_info, buffer_usage, allocate_memory);
+}
+
+VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) {
+  VmaAllocationCreateInfo alloc_create_info = {};
+  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
+  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
+
+  VkBufferUsageFlags buffer_usage =
+      VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+  return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage);
+}
+
+VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) {
+  VmaAllocationCreateInfo alloc_create_info = {};
+  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY |
+      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+  alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO;
+
+  VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+
+  VulkanBuffer uniform_buffer(
+      allocator_, size, alloc_create_info, buffer_usage);
+  return uniform_buffer;
+}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Allocator.h b/backends/vulkan/runtime/api/memory/Allocator.h
new file mode 100644
index 00000000000..f1d3a449f56
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Allocator.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
+
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/Utils.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/vma_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocation.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
+
+namespace vkcompute {
+namespace api {
+
+constexpr VmaAllocationCreateFlags DEFAULT_ALLOCATION_STRATEGY =
+    VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT;
+
+class Allocator final {
+ public:
+  explicit Allocator(
+      VkInstance instance,
+      VkPhysicalDevice physical_device,
+      VkDevice device);
+
+  Allocator(const Allocator&) = delete;
+  Allocator& operator=(const Allocator&) = delete;
+
+  Allocator(Allocator&&) noexcept;
+  Allocator& operator=(Allocator&&) = delete;
+
+  ~Allocator();
+
+ private:
+  VkInstance instance_;
+  VkPhysicalDevice physical_device_;
+  VkDevice device_;
+  VmaAllocator allocator_;
+
+ public:
+  Allocation create_allocation(
+      const VkMemoryRequirements& memory_requirements,
+      const VmaAllocationCreateInfo& create_info);
+
+  VulkanImage create_image(
+      const VkExtent3D&,
+      const VkFormat,
+      const VkImageType,
+      const VkImageViewType,
+      const VulkanImage::SamplerProperties&,
+      VkSampler,
+      const bool allow_transfer = false,
+      const bool allocate_memory = true);
+
+  VulkanBuffer create_storage_buffer(
+      const VkDeviceSize,
+      const bool gpu_only = true,
+      const bool allocate_memory = true);
+
+  VulkanBuffer create_staging_buffer(const VkDeviceSize);
+
+  /*
+   * Create a uniform buffer with a specified size
+   */
+  VulkanBuffer create_uniform_buffer(const VkDeviceSize);
+
+  /*
+   * Create a uniform buffer containing the data in an arbitrary struct
+   */
+  template <typename Block>
+  VulkanBuffer create_params_buffer(const Block& block);
+
+  VmaTotalStatistics get_memory_statistics() const {
+    VmaTotalStatistics stats = {};
+    vmaCalculateStatistics(allocator_, &stats);
+    return stats;
+  }
+};
+
+//
+// Impl
+//
+
+template <typename Block>
+inline VulkanBuffer Allocator::create_params_buffer(const Block& block) {
+  VulkanBuffer uniform_buffer = create_uniform_buffer(sizeof(Block));
+
+  // Fill the uniform buffer with data in block
+  {
+    MemoryMap mapping(uniform_buffer, MemoryAccessType::WRITE);
+    Block* data_ptr = mapping.template data<Block>();
+
+    *data_ptr = block;
+  }
+
+  return uniform_buffer;
+}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Buffer.cpp b/backends/vulkan/runtime/api/memory/Buffer.cpp
new file mode 100644
index 00000000000..b12f1bf8deb
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Buffer.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/api/memory/Buffer.h>
+
+namespace vkcompute {
+namespace api {
+
+//
+// VulkanBuffer
+//
+
+VulkanBuffer::VulkanBuffer()
+    : buffer_properties_{},
+      allocator_(VK_NULL_HANDLE),
+      memory_{},
+      owns_memory_(false),
+      handle_(VK_NULL_HANDLE) {}
+
+VulkanBuffer::VulkanBuffer(
+    VmaAllocator vma_allocator,
+    const VkDeviceSize size,
+    const VmaAllocationCreateInfo& allocation_create_info,
+    const VkBufferUsageFlags usage,
+    const bool allocate_memory)
+    : buffer_properties_({
+          size,
+          0u,
+          size,
+          usage,
+      }),
+      allocator_(vma_allocator),
+      memory_{},
+      owns_memory_(allocate_memory),
+      handle_(VK_NULL_HANDLE) {
+  // Only allocate memory if the buffer has non-zero size
+  if (size == 0) {
+    return;
+  }
+
+  const VkBufferCreateInfo buffer_create_info{
+      VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // sType
+      nullptr, // pNext
+      0u, // flags
+      size, // size
+      buffer_properties_.buffer_usage, // usage
+      VK_SHARING_MODE_EXCLUSIVE, // sharingMode
+      0u, // queueFamilyIndexCount
+      nullptr, // pQueueFamilyIndices
+  };
+
+  memory_.create_info = allocation_create_info;
+
+  if (allocate_memory) {
+    VK_CHECK(vmaCreateBuffer(
+        allocator_,
+        &buffer_create_info,
+        &allocation_create_info,
+        &handle_,
+        &(memory_.allocation),
+        nullptr));
+  } else {
+    VmaAllocatorInfo allocator_info{};
+    vmaGetAllocatorInfo(allocator_, &allocator_info);
+    VK_CHECK(vkCreateBuffer(
+        allocator_info.device, &buffer_create_info, nullptr, &handle_));
+  }
+}
+
+VulkanBuffer::VulkanBuffer(VulkanBuffer&& other) noexcept
+    : buffer_properties_(other.buffer_properties_),
+      allocator_(other.allocator_),
+      memory_(std::move(other.memory_)),
+      owns_memory_(other.owns_memory_),
+      handle_(other.handle_) {
+  other.handle_ = VK_NULL_HANDLE;
+}
+
+VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept {
+  // Plain state is copied from `other`; the allocation record is handed over
+  // through its own move assignment.
+  buffer_properties_ = other.buffer_properties_;
+  allocator_ = other.allocator_;
+  memory_ = std::move(other.memory_);
+
+  // The buffer handle and the ownership flag are exchanged rather than
+  // overwritten, so the handle previously held by this object ends up in
+  // `other` and is released by `other`'s destructor.
+  std::swap(handle_, other.handle_);
+  std::swap(owns_memory_, other.owns_memory_);
+
+  return *this;
+}
+
+VulkanBuffer::~VulkanBuffer() {
+  if (handle_ == VK_NULL_HANDLE) {
+    return; // no buffer handle is held, nothing to destroy
+  }
+  if (!owns_memory_) {
+    vkDestroyBuffer(this->device(), handle_, nullptr);
+  } else {
+    vmaDestroyBuffer(allocator_, handle_, memory_.allocation);
+  }
+  // Clear the allocation so that it is not freed afterwards: it was either
+  // released by vmaDestroyBuffer above or is not owned by this buffer.
+  memory_.allocation = VK_NULL_HANDLE;
+}
+
+VkMemoryRequirements VulkanBuffer::get_memory_requirements() const {
+  VkMemoryRequirements memory_requirements;
+  vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements);
+  return memory_requirements;
+}
+
+//
+// MemoryMap
+//
+
+MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access)
+    : access_(access),
+      allocator_(buffer.vma_allocator()),
+      allocation_(buffer.allocation()),
+      data_(nullptr),
+      data_len_{buffer.mem_size()} {
+  if (allocation_) {
+    VK_CHECK(vmaMapMemory(allocator_, allocation_, &data_));
+  }
+}
+
+MemoryMap::MemoryMap(MemoryMap&& other) noexcept
+    : access_(other.access_),
+      allocator_(other.allocator_),
+      allocation_(other.allocation_),
+      data_(other.data_),
+      data_len_{other.data_len_} {
+  other.allocation_ = VK_NULL_HANDLE;
+  other.data_ = nullptr;
+}
+
+MemoryMap::~MemoryMap() {
+  // Without a mapped pointer or an allocation (for example after this object
+  // has been moved from) there is nothing to flush or unmap.
+  if (data_ == nullptr || !allocation_) {
+    return;
+  }
+
+  if ((access_ & MemoryAccessType::WRITE) != 0) {
+    // The implementation ignores this call when the memory type of the
+    // allocation is not HOST_VISIBLE or is HOST_COHERENT, which is the
+    // desired behavior. The result is deliberately not checked because a
+    // destructor cannot throw.
+    vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE);
+  }
+
+  vmaUnmapMemory(allocator_, allocation_);
+}
+
+void MemoryMap::invalidate() {
+  const bool opened_for_read = (access_ & MemoryAccessType::READ) != 0;
+  if (!opened_for_read || !allocation_) {
+    return;
+  }
+  // The implementation ignores this call when the memory type is not
+  // HOST_VISIBLE or is HOST_COHERENT, which is the desired behavior.
+  VK_CHECK(vmaInvalidateAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE));
+}
+
+//
+// BufferMemoryBarrier
+//
+
+BufferMemoryBarrier::BufferMemoryBarrier(
+    const VkAccessFlags src_access_flags,
+    const VkAccessFlags dst_access_flags,
+    const VulkanBuffer& buffer)
+    : handle{
+          VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // sType
+          nullptr, // pNext
+          src_access_flags, // srcAccessMask
+          dst_access_flags, // dstAccessMask
+          VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex
+          VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex
+          buffer.handle_, // buffer
+          buffer.buffer_properties_.mem_offset, // offset
+          buffer.buffer_properties_.mem_range, // size
+      } {}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Buffer.h b/backends/vulkan/runtime/api/memory/Buffer.h
new file mode 100644
index 00000000000..c0eea5bea6e
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Buffer.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
+
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/Utils.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/vma_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocation.h>
+
+namespace vkcompute {
+namespace api {
+
+using MemoryAccessFlags = uint8_t;
+
+enum MemoryAccessType : MemoryAccessFlags {
+  NONE = 0u << 0u,
+  READ = 1u << 0u,
+  WRITE = 1u << 1u,
+};
+
+class VulkanBuffer final {
+ public:
+  struct BufferProperties final {
+    VkDeviceSize size;
+    VkDeviceSize mem_offset;
+    VkDeviceSize mem_range;
+    VkBufferUsageFlags buffer_usage;
+  };
+
+  explicit VulkanBuffer();
+
+  explicit VulkanBuffer(
+      const VmaAllocator,
+      const VkDeviceSize,
+      const VmaAllocationCreateInfo&,
+      const VkBufferUsageFlags,
+      const bool allocate_memory = true);
+
+  VulkanBuffer(const VulkanBuffer&) = delete;
+  VulkanBuffer& operator=(const VulkanBuffer&) = delete;
+
+  VulkanBuffer(VulkanBuffer&&) noexcept;
+  VulkanBuffer& operator=(VulkanBuffer&&) noexcept;
+
+  ~VulkanBuffer();
+
+  struct Package final {
+    VkBuffer handle;
+    VkDeviceSize buffer_offset;
+    VkDeviceSize buffer_range;
+  };
+
+  friend struct BufferMemoryBarrier;
+
+ private:
+  BufferProperties buffer_properties_;
+  VmaAllocator allocator_;
+  Allocation memory_;
+  // Indicates whether the underlying memory is owned by this resource
+  bool owns_memory_;
+  VkBuffer handle_;
+
+ public:
+  inline VkDevice device() const {
+    VmaAllocatorInfo allocator_info{};
+    vmaGetAllocatorInfo(allocator_, &allocator_info);
+    return allocator_info.device;
+  }
+
+  inline VmaAllocator vma_allocator() const {
+    return allocator_;
+  }
+
+  inline VmaAllocation allocation() const {
+    return memory_.allocation;
+  }
+
+  inline VmaAllocationCreateInfo allocation_create_info() const {
+    return VmaAllocationCreateInfo(memory_.create_info);
+  }
+
+  inline VkBuffer handle() const {
+    return handle_;
+  }
+
+  inline VkDeviceSize mem_offset() const {
+    return buffer_properties_.mem_offset;
+  }
+
+  inline VkDeviceSize mem_range() const {
+    return buffer_properties_.mem_range;
+  }
+
+  inline VkDeviceSize mem_size() const {
+    return buffer_properties_.size;
+  }
+
+  inline bool has_memory() const {
+    return (memory_.allocation != VK_NULL_HANDLE);
+  }
+
+  inline bool owns_memory() const {
+    return owns_memory_;
+  }
+
+  operator bool() const {
+    return (handle_ != VK_NULL_HANDLE);
+  }
+
+  inline void bind_allocation(const Allocation& memory) {
+    VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
+    VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
+    memory_.allocation = memory.allocation;
+  }
+
+  VkMemoryRequirements get_memory_requirements() const;
+};
+
+class MemoryMap final {
+ public:
+  explicit MemoryMap(
+      const VulkanBuffer& buffer,
+      const MemoryAccessFlags access);
+
+  MemoryMap(const MemoryMap&) = delete;
+  MemoryMap& operator=(const MemoryMap&) = delete;
+
+  MemoryMap(MemoryMap&&) noexcept;
+  MemoryMap& operator=(MemoryMap&&) = delete;
+
+  ~MemoryMap();
+
+ private:
+  uint8_t access_;
+  VmaAllocator allocator_;
+  VmaAllocation allocation_;
+  void* data_;
+  VkDeviceSize data_len_;
+
+ public:
+  template <typename T>
+  T* data() {
+    return reinterpret_cast<T*>(data_);
+  }
+
+  inline size_t nbytes() {
+    return utils::safe_downcast<size_t>(data_len_);
+  }
+
+  void invalidate();
+};
+
+struct BufferMemoryBarrier final {
+  VkBufferMemoryBarrier handle;
+
+  BufferMemoryBarrier(
+      const VkAccessFlags src_access_flags,
+      const VkAccessFlags dst_access_flags,
+      const VulkanBuffer& buffer);
+};
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Image.cpp b/backends/vulkan/runtime/api/memory/Image.cpp
new file mode 100644
index 00000000000..449dbaf2416
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Image.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/api/memory/Image.h>
+
+namespace vkcompute {
+namespace api {
+
+//
+// ImageSampler
+//
+
+bool operator==(
+    const ImageSampler::Properties& _1,
+    const ImageSampler::Properties& _2) {
+  return (
+      _1.filter == _2.filter && _1.mipmap_mode == _2.mipmap_mode &&
+      _1.address_mode == _2.address_mode && _1.border_color == _2.border_color);
+}
+
+ImageSampler::ImageSampler(
+    VkDevice device,
+    const ImageSampler::Properties& props)
+    : device_(device), handle_(VK_NULL_HANDLE) {
+  const VkSamplerCreateInfo sampler_create_info{
+      VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, // sType
+      nullptr, // pNext
+      0u, // flags
+      props.filter, // magFilter
+      props.filter, // minFilter
+      props.mipmap_mode, // mipmapMode
+      props.address_mode, // addressModeU
+      props.address_mode, // addressModeV
+      props.address_mode, // addressModeW
+      0.0f, // mipLodBias
+      VK_FALSE, // anisotropyEnable
+      1.0f, // maxAnisotropy,
+      VK_FALSE, // compareEnable
+      VK_COMPARE_OP_NEVER, // compareOp
+      0.0f, // minLod
+      VK_LOD_CLAMP_NONE, // maxLod
+      props.border_color, // borderColor
+      VK_FALSE, // unnormalizedCoordinates
+  };
+
+  VK_CHECK(vkCreateSampler(device_, &sampler_create_info, nullptr, &handle_));
+}
+
+ImageSampler::ImageSampler(ImageSampler&& other) noexcept
+    : device_(other.device_), handle_(other.handle_) {
+  other.handle_ = VK_NULL_HANDLE;
+}
+
+ImageSampler::~ImageSampler() {
+  if (VK_NULL_HANDLE == handle_) {
+    return;
+  }
+  vkDestroySampler(device_, handle_, nullptr);
+}
+
+size_t ImageSampler::Hasher::operator()(
+    const ImageSampler::Properties& props) const {
+  // Per-field hashes, folded below in declaration order starting from 0.
+  const size_t filter = std::hash<VkFilter>()(props.filter);
+  const size_t mipmap = std::hash<VkSamplerMipmapMode>()(props.mipmap_mode);
+  const size_t address = std::hash<VkSamplerAddressMode>()(props.address_mode);
+  const size_t border = std::hash<VkBorderColor>()(props.border_color);
+
+  size_t seed = utils::hash_combine(size_t{0}, filter);
+  seed = utils::hash_combine(utils::hash_combine(seed, mipmap), address);
+  return utils::hash_combine(seed, border);
+}
+
+void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept {
+  // ImageSampler deletes move assignment, so the two objects are exchanged
+  // member by member instead of as a whole.
+
+  // Device stored alongside the sampler.
+  std::swap(lhs.device_, rhs.device_);
+
+  // The sampler handle itself.
+  std::swap(lhs.handle_, rhs.handle_);
+}
+
+//
+// VulkanImage
+//
+
+VulkanImage::VulkanImage()
+    : image_properties_{},
+      view_properties_{},
+      sampler_properties_{},
+      allocator_(VK_NULL_HANDLE),
+      memory_{},
+      owns_memory_(false),
+      handles_{
+          VK_NULL_HANDLE,
+          VK_NULL_HANDLE,
+          VK_NULL_HANDLE,
+      },
+      layout_{} {}
+
+VulkanImage::VulkanImage(
+    VmaAllocator vma_allocator,
+    const VmaAllocationCreateInfo& allocation_create_info,
+    const ImageProperties& image_props,
+    const ViewProperties& view_props,
+    const SamplerProperties& sampler_props,
+    const VkImageLayout layout,
+    VkSampler sampler,
+    const bool allocate_memory)
+    : image_properties_(image_props),
+      view_properties_(view_props),
+      sampler_properties_(sampler_props),
+      allocator_(vma_allocator),
+      memory_{},
+      owns_memory_{allocate_memory},
+      handles_{
+          VK_NULL_HANDLE,
+          VK_NULL_HANDLE,
+          sampler,
+      },
+      layout_(layout) {
+  VmaAllocatorInfo allocator_info{};
+  vmaGetAllocatorInfo(allocator_, &allocator_info);
+
+  // If any dims are zero, then no memory will be allocated for the image.
+  if (image_props.image_extents.width == 0 ||
+      image_props.image_extents.height == 0 ||
+      image_props.image_extents.depth == 0) {
+    return;
+  }
+
+  const VkImageCreateInfo image_create_info{
+      VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType
+      nullptr, // pNext
+      0u, // flags
+      image_properties_.image_type, // imageType
+      image_properties_.image_format, // format
+      image_properties_.image_extents, // extents
+      1u, // mipLevels
+      1u, // arrayLayers
+      VK_SAMPLE_COUNT_1_BIT, // samples
+      VK_IMAGE_TILING_OPTIMAL, // tiling
+      image_properties_.image_usage, // usage
+      VK_SHARING_MODE_EXCLUSIVE, // sharingMode
+      0u, // queueFamilyIndexCount
+      nullptr, // pQueueFamilyIndices
+      layout_, // initialLayout
+  };
+
+  memory_.create_info = allocation_create_info;
+
+  if (allocate_memory) {
+    VK_CHECK(vmaCreateImage(
+        allocator_,
+        &image_create_info,
+        &allocation_create_info,
+        &(handles_.image),
+        &(memory_.allocation),
+        nullptr));
+    // Only create the image view if the image has been bound to memory
+    create_image_view();
+  } else {
+    VK_CHECK(vkCreateImage(
+        allocator_info.device, &image_create_info, nullptr, &(handles_.image)));
+  }
+}
+
+VulkanImage::VulkanImage(VulkanImage&& other) noexcept
+    : image_properties_(other.image_properties_),
+      view_properties_(other.view_properties_),
+      sampler_properties_(other.sampler_properties_),
+      allocator_(other.allocator_),
+      memory_(std::move(other.memory_)),
+      owns_memory_(other.owns_memory_),
+      handles_(other.handles_),
+      layout_(other.layout_) {
+  other.handles_.image = VK_NULL_HANDLE;
+  other.handles_.image_view = VK_NULL_HANDLE;
+  other.handles_.sampler = VK_NULL_HANDLE;
+  other.owns_memory_ = false;
+}
+
+VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept {
+  // Descriptive state is copied from `other`; the allocation record is handed
+  // over through its own move assignment.
+  image_properties_ = other.image_properties_;
+  view_properties_ = other.view_properties_;
+  sampler_properties_ = other.sampler_properties_;
+  allocator_ = other.allocator_;
+  memory_ = std::move(other.memory_);
+  layout_ = other.layout_;
+
+  // The image, its view and the ownership flag are exchanged so that the
+  // handles previously held by this object are released by `other`'s
+  // destructor. The sampler handle is only copied; `other` keeps its own.
+  std::swap(handles_.image, other.handles_.image);
+  std::swap(handles_.image_view, other.handles_.image_view);
+  handles_.sampler = other.handles_.sampler;
+  std::swap(owns_memory_, other.owns_memory_);
+
+  return *this;
+}
+
+VulkanImage::~VulkanImage() {
+  // The view is destroyed first, then the image it was created from.
+  if (handles_.image_view != VK_NULL_HANDLE) {
+    vkDestroyImageView(this->device(), handles_.image_view, nullptr);
+  }
+
+  if (handles_.image == VK_NULL_HANDLE) {
+    return;
+  }
+  if (!owns_memory_) {
+    vkDestroyImage(this->device(), handles_.image, nullptr);
+  } else {
+    vmaDestroyImage(allocator_, handles_.image, memory_.allocation);
+  }
+  // Freed by vmaDestroyImage or not owned: keep memory_ from freeing it.
+  memory_.allocation = VK_NULL_HANDLE;
+}
+
+void VulkanImage::create_image_view() {
+  VmaAllocatorInfo allocator_info{};
+  vmaGetAllocatorInfo(allocator_, &allocator_info);
+
+  const VkComponentMapping component_mapping{
+      VK_COMPONENT_SWIZZLE_IDENTITY, // r
+      VK_COMPONENT_SWIZZLE_IDENTITY, // g
+      VK_COMPONENT_SWIZZLE_IDENTITY, // b
+      VK_COMPONENT_SWIZZLE_IDENTITY, // a
+  };
+
+  const VkImageSubresourceRange subresource_range{
+      VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask
+      0u, // baseMipLevel
+      VK_REMAINING_MIP_LEVELS, // levelCount
+      0u, // baseArrayLayer
+      VK_REMAINING_ARRAY_LAYERS, // layerCount
+  };
+
+  const VkImageViewCreateInfo image_view_create_info{
+      VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, // sType
+      nullptr, // pNext
+      0u, // flags
+      handles_.image, // image
+      view_properties_.view_type, // viewType
+      view_properties_.view_format, // format
+      component_mapping, // components
+      subresource_range, // subresourceRange
+  };
+
+  VK_CHECK(vkCreateImageView(
+      allocator_info.device,
+      &(image_view_create_info),
+      nullptr,
+      &(handles_.image_view)));
+}
+
+VkMemoryRequirements VulkanImage::get_memory_requirements() const {
+  VkMemoryRequirements memory_requirements;
+  vkGetImageMemoryRequirements(
+      this->device(), handles_.image, &memory_requirements);
+  return memory_requirements;
+}
+
+//
+// ImageMemoryBarrier
+//
+
+ImageMemoryBarrier::ImageMemoryBarrier(
+    const VkAccessFlags src_access_flags,
+    const VkAccessFlags dst_access_flags,
+    const VkImageLayout src_layout_flags,
+    const VkImageLayout dst_layout_flags,
+    const VulkanImage& image)
+    : handle{
+          VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, // sType
+          nullptr, // pNext
+          src_access_flags, // srcAccessMask
+          dst_access_flags, // dstAccessMask
+          src_layout_flags, // oldLayout
+          dst_layout_flags, // newLayout
+          VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex
+          VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex
+          image.handles_.image, // image
+          {
+              // subresourceRange
+              VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask
+              0u, // baseMipLevel
+              VK_REMAINING_MIP_LEVELS, // levelCount
+              0u, // baseArrayLayer
+              VK_REMAINING_ARRAY_LAYERS, // layerCount
+          },
+      } {}
+
+//
+// SamplerCache
+//
+
+SamplerCache::SamplerCache(VkDevice device) : device_(device) {}
+
+SamplerCache::SamplerCache(SamplerCache&& other) noexcept
+    : device_(other.device_) {
+  std::lock_guard<std::mutex> lock(other.cache_mutex_); // held for the swap
+  cache_.swap(other.cache_); // cache_ starts empty, so this is a move
+}
+
+SamplerCache::~SamplerCache() {
+  purge();
+}
+
+VkSampler SamplerCache::retrieve(const SamplerCache::Key& key) {
+  std::lock_guard<std::mutex> lock(cache_mutex_);
+
+  const auto cached = cache_.find(key);
+  if (cached != cache_.cend()) {
+    return cached->second.handle();
+  }
+  // First request for these properties: create the sampler and cache it.
+  return cache_.insert({key, Value(device_, key)}).first->second.handle();
+}
+
+void SamplerCache::purge() {
+  std::lock_guard<std::mutex> lock(cache_mutex_);
+  cache_.clear();
+}
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/memory/Image.h b/backends/vulkan/runtime/api/memory/Image.h
new file mode 100644
index 00000000000..e3f4d7437df
--- /dev/null
+++ b/backends/vulkan/runtime/api/memory/Image.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
+
+#include <executorch/backends/vulkan/runtime/api/vk_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/Utils.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/vma_api.h>
+
+#include <executorch/backends/vulkan/runtime/api/memory/Allocation.h>
+
+#include <mutex>
+#include <unordered_map>
+
+namespace vkcompute {
+namespace api {
+
+class ImageSampler final {
+ public:
+  struct Properties final {
+    VkFilter filter;
+    VkSamplerMipmapMode mipmap_mode;
+    VkSamplerAddressMode address_mode;
+    VkBorderColor border_color;
+  };
+
+  explicit ImageSampler(VkDevice, const Properties&);
+
+  ImageSampler(const ImageSampler&) = delete;
+  ImageSampler& operator=(const ImageSampler&) = delete;
+
+  ImageSampler(ImageSampler&&) noexcept;
+  ImageSampler& operator=(ImageSampler&&) = delete;
+
+  ~ImageSampler();
+
+ private:
+  VkDevice device_;
+  VkSampler handle_;
+
+ public:
+  VkSampler handle() const {
+    return handle_;
+  }
+
+  struct Hasher {
+    size_t operator()(const Properties&) const;
+  };
+
+  // We need to define a custom swap function since this class
+  // does not allow for move assignment. The swap function will
+  // be used in the hash map.
+  friend void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept;
+};
+
+class VulkanImage final {
+ public:
+  struct ImageProperties final {
+    VkImageType image_type;
+    VkFormat image_format;
+    VkExtent3D image_extents;
+    VkImageUsageFlags image_usage;
+  };
+
+  struct ViewProperties final {
+    VkImageViewType view_type;
+    VkFormat view_format;
+  };
+
+  using SamplerProperties = ImageSampler::Properties;
+
+  struct Handles final {
+    VkImage image;
+    VkImageView image_view;
+    VkSampler sampler;
+  };
+
+  explicit VulkanImage();
+
+  explicit VulkanImage(
+      const VmaAllocator,
+      const VmaAllocationCreateInfo&,
+      const ImageProperties&,
+      const ViewProperties&,
+      const SamplerProperties&,
+      const VkImageLayout layout,
+      VkSampler,
+      const bool allocate_memory = true);
+
+  VulkanImage(const VulkanImage&) = delete;
+  VulkanImage& operator=(const VulkanImage&) = delete;
+
+  VulkanImage(VulkanImage&&) noexcept;
+  VulkanImage& operator=(VulkanImage&&) noexcept;
+
+  ~VulkanImage();
+
+  struct Package final {
+    VkImage handle;
+    VkImageLayout image_layout;
+    VkImageView image_view;
+    VkSampler image_sampler;
+  };
+
+  friend struct ImageMemoryBarrier;
+
+ private:
+  ImageProperties image_properties_;
+  ViewProperties view_properties_;
+  SamplerProperties sampler_properties_;
+  // The allocator object this was allocated from
+  VmaAllocator allocator_;
+  // Handles to the allocated memory
+  Allocation memory_;
+  // Indicates whether the underlying memory is owned by this resource
+  bool owns_memory_;
+  Handles handles_;
+  // Layout
+  VkImageLayout layout_;
+
+ public:
+  void create_image_view();
+
+  inline VkDevice device() const {
+    VmaAllocatorInfo allocator_info{};
+    vmaGetAllocatorInfo(allocator_, &allocator_info);
+    return allocator_info.device;
+  }
+
+  inline VmaAllocator vma_allocator() const {
+    return allocator_;
+  }
+
+  inline VmaAllocation allocation() const {
+    return memory_.allocation;
+  }
+
+  inline VmaAllocationCreateInfo allocation_create_info() const {
+    return VmaAllocationCreateInfo(memory_.create_info);
+  }
+
+  inline VkFormat format() const {
+    return image_properties_.image_format;
+  }
+
+  inline VkExtent3D extents() const {
+    return image_properties_.image_extents;
+  }
+
+  inline VkImage handle() const {
+    return handles_.image;
+  }
+
+  inline VkImageView image_view() const {
+    return handles_.image_view;
+  }
+
+  inline VkSampler sampler() const {
+    return handles_.sampler;
+  }
+
+  Package package() const {
+    return {
+        handles_.image,
+        layout_,
+        handles_.image_view,
+        handles_.sampler,
+    };
+  }
+
+  inline VkImageLayout layout() const {
+    return layout_;
+  }
+
+  inline void set_layout(const VkImageLayout layout) {
+    layout_ = layout;
+  }
+
+  inline bool has_memory() const {
+    return (memory_.allocation != VK_NULL_HANDLE);
+  }
+
+  inline bool owns_memory() const {
+    return owns_memory_;
+  }
+
+  inline operator bool() const {
+    return (handles_.image != VK_NULL_HANDLE);
+  }
+
+  inline void bind_allocation(const Allocation& memory) {
+    VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
+    VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image));
+    memory_.allocation = memory.allocation;
+
+    // Only create the image view if the image has been bound to memory
+    create_image_view();
+  }
+
+  VkMemoryRequirements get_memory_requirements() const;
+};
+
+struct ImageMemoryBarrier final {
+  VkImageMemoryBarrier handle;
+
+  ImageMemoryBarrier(
+      const VkAccessFlags src_access_flags,
+      const VkAccessFlags dst_access_flags,
+      const VkImageLayout src_layout_flags,
+      const VkImageLayout dst_layout_flags,
+      const VulkanImage& image);
+};
+
+class SamplerCache final {
+ public:
+  explicit SamplerCache(VkDevice device);
+
+  SamplerCache(const SamplerCache&) = delete;
+  SamplerCache& operator=(const SamplerCache&) = delete;
+
+  SamplerCache(SamplerCache&&) noexcept;
+  SamplerCache& operator=(SamplerCache&&) = delete;
+
+  ~SamplerCache();
+
+  using Key = ImageSampler::Properties;
+  using Value = ImageSampler;
+  using Hasher = ImageSampler::Hasher;
+
+ private:
+  // Multiple threads could potentially be adding entries into the cache, so use
+  // a mutex to manage access
+  std::mutex cache_mutex_;
+
+  VkDevice device_;
+  std::unordered_map<Key, Value, Hasher> cache_;
+
+ public:
+  VkSampler retrieve(const Key&);
+  void purge();
+};
+
+} // namespace api
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/vma_api.cpp b/backends/vulkan/runtime/api/memory/vma_api.cpp
similarity index 78%
rename from backends/vulkan/runtime/api/vma_api.cpp
rename to backends/vulkan/runtime/api/memory/vma_api.cpp
index 26672339adf..d1180305fea 100644
--- a/backends/vulkan/runtime/api/vma_api.cpp
+++ b/backends/vulkan/runtime/api/memory/vma_api.cpp
@@ -7,4 +7,4 @@
  */
 
 #define VMA_IMPLEMENTATION
-#include <executorch/backends/vulkan/runtime/api/vma_api.h>
+#include <executorch/backends/vulkan/runtime/api/memory/vma_api.h>
diff --git a/backends/vulkan/runtime/api/vma_api.h b/backends/vulkan/runtime/api/memory/vma_api.h
similarity index 100%
rename from backends/vulkan/runtime/api/vma_api.h
rename to backends/vulkan/runtime/api/memory/vma_api.h
diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h
index f1e96bf0c2c..09509ad45b9 100644
--- a/backends/vulkan/runtime/graph/containers/SharedObject.h
+++ b/backends/vulkan/runtime/graph/containers/SharedObject.h
@@ -30,7 +30,7 @@ struct SharedObject {
   VkMemoryRequirements aggregate_memory_requirements;
   VmaAllocationCreateInfo aggregate_create_info;
   std::vector<ValueRef> users;
-  api::MemoryAllocation allocation;
+  api::Allocation allocation;
 
   void add_user(ComputeGraph* const graph, const ValueRef idx);
   void allocate(ComputeGraph* const graph);
diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl
index 573eaf2e1c0..d80c694e6db 100644
--- a/backends/vulkan/targets.bzl
+++ b/backends/vulkan/targets.bzl
@@ -150,10 +150,10 @@ def define_common_targets(is_fbcode = False):
         name = "vulkan_compute_api",
         compiler_flags = get_vulkan_compiler_flags(),
         srcs = native.glob([
-            "runtime/api/*.cpp",
+            "runtime/api/**/*.cpp",
         ]),
         exported_headers = native.glob([
-            "runtime/api/*.h",
+            "runtime/api/**/*.h",
         ]),
         visibility = [
             "//executorch/backends/vulkan/...",
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 37ced363b61..7bbba9108a5 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -204,7 +204,7 @@ void submit_to_gpu() {
   fence.wait();
 }
 
-api::MemoryAllocation allocate_memory_for(const vTensor& vten) {
+api::Allocation allocate_memory_for(const vTensor& vten) {
   return api::context()->adapter_ptr()->vma().create_allocation(
       vten.get_memory_requirements(), vten.get_allocation_create_info());
 }
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index 168f643fe52..1a65ea04c26 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -179,7 +179,7 @@ inline int64_t get_buf_idx(
 
 void submit_to_gpu();
 
-api::MemoryAllocation allocate_memory_for(const vTensor& vten);
+api::Allocation allocate_memory_for(const vTensor& vten);
 
 VmaTotalStatistics get_vma_stats();
 
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 85c0a5ebb46..614a2ffcaf6 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -301,11 +301,11 @@ TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) {
   std::fill(data_b.begin(), data_b.end(), 1.5f);
 
   // Allocate memory at the last possible opportunity
-  api::MemoryAllocation a_mem = allocate_memory_for(a);
+  api::Allocation a_mem = allocate_memory_for(a);
   a.image().bind_allocation(a_mem);
-  api::MemoryAllocation b_mem = allocate_memory_for(b);
+  api::Allocation b_mem = allocate_memory_for(b);
   b.image().bind_allocation(b_mem);
-  api::MemoryAllocation c_mem = allocate_memory_for(c);
+  api::Allocation c_mem = allocate_memory_for(c);
   c.image().bind_allocation(c_mem);
 
   // One allocation for each tensor
@@ -341,15 +341,15 @@ TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) {
   EXPECT_TRUE(get_vma_allocation_count() == 0);
 
   // a and d can share the same memory allocation
-  api::MemoryAllocation a_d_mem = allocate_memory_for(a);
+  api::Allocation a_d_mem = allocate_memory_for(a);
   a.image().bind_allocation(a_d_mem);
   d.image().bind_allocation(a_d_mem);
   // b and e can share the same memory allocation
-  api::MemoryAllocation b_e_mem = allocate_memory_for(b);
+  api::Allocation b_e_mem = allocate_memory_for(b);
   b.image().bind_allocation(b_e_mem);
   e.image().bind_allocation(b_e_mem);
   // c must have its own memory allocation
-  api::MemoryAllocation c_mem = allocate_memory_for(c);
+  api::Allocation c_mem = allocate_memory_for(c);
   c.image().bind_allocation(c_mem);
 
   // 3 allocations should be made
@@ -394,7 +394,7 @@ TEST_F(VulkanComputeAPITest, resource_bind_twice_fails) {
   vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true);
 
   // Try to double bind a resource, which should fail
-  api::MemoryAllocation a_mem = allocate_memory_for(a);
+  api::Allocation a_mem = allocate_memory_for(a);
   EXPECT_THROW(a.image().bind_allocation(a_mem), api::Error);
 }
 
@@ -402,9 +402,9 @@ TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) {
   // Check that the destructor of a vTensor that does not own its memory
   // does not free the memory
 
-  api::MemoryAllocation memory;
+  api::Allocation memory;
 
-  // Default MemoryAllocation constructor should not allocate memory
+  // Default Allocation constructor should not allocate memory
   EXPECT_TRUE(get_vma_allocation_count() == 0);
 
   std::vector<int64_t> sizes = {4, 4, 1};
@@ -464,11 +464,11 @@ TEST_F(
   vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
   vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
 
-  api::MemoryAllocation a_mem = allocate_memory_for(a);
+  api::Allocation a_mem = allocate_memory_for(a);
   a.image().bind_allocation(a_mem);
-  api::MemoryAllocation b_mem = allocate_memory_for(b);
+  api::Allocation b_mem = allocate_memory_for(b);
   b.image().bind_allocation(b_mem);
-  api::MemoryAllocation c_mem = allocate_memory_for(c);
+  api::Allocation c_mem = allocate_memory_for(c);
   c.image().bind_allocation(c_mem);
 
   execute_and_check_add(a, b, c, 4.0f, 8.0f);

From f6f881af63fd9d06380a20a810b85f92e0ac4242 Mon Sep 17 00:00:00 2001
From: Jorge Pineda <jorgep31415@meta.com>
Date: Thu, 9 May 2024 12:22:59 -0700
Subject: [PATCH 20/62] Remove unneeded `api::` prefix within `namespace api`
 (#3554)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3554

TSIA
ghstack-source-id: 225755563
exported-using-ghexport
bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: yipjustin

Differential Revision: D57126896

fbshipit-source-id: 2c4467b07730ca3371244fb95067a3b3b654163c
---
 backends/vulkan/runtime/api/Command.cpp  | 46 ++++++++++++------------
 backends/vulkan/runtime/api/Command.h    | 40 ++++++++++-----------
 backends/vulkan/runtime/api/Context.cpp  |  2 +-
 backends/vulkan/runtime/api/Context.h    | 46 ++++++++++++------------
 backends/vulkan/runtime/api/Pipeline.cpp |  2 +-
 backends/vulkan/runtime/api/Pipeline.h   |  2 +-
 6 files changed, 68 insertions(+), 70 deletions(-)

diff --git a/backends/vulkan/runtime/api/Command.cpp b/backends/vulkan/runtime/api/Command.cpp
index 841c40e471a..9c70cfa60b2 100644
--- a/backends/vulkan/runtime/api/Command.cpp
+++ b/backends/vulkan/runtime/api/Command.cpp
@@ -133,16 +133,14 @@ void CommandBuffer::insert_barrier(PipelineBarrier& pipeline_barrier) {
     if (!pipeline_barrier.buffer_barrier_handles.empty()) {
       pipeline_barrier.buffer_barrier_handles.clear();
     }
-    for (const api::BufferMemoryBarrier& memory_barrier :
-         pipeline_barrier.buffers) {
+    for (const BufferMemoryBarrier& memory_barrier : pipeline_barrier.buffers) {
       pipeline_barrier.buffer_barrier_handles.push_back(memory_barrier.handle);
     }
 
     if (!pipeline_barrier.image_barrier_handles.empty()) {
       pipeline_barrier.image_barrier_handles.clear();
     }
-    for (const api::ImageMemoryBarrier& memory_barrier :
-         pipeline_barrier.images) {
+    for (const ImageMemoryBarrier& memory_barrier : pipeline_barrier.images) {
       pipeline_barrier.image_barrier_handles.push_back(memory_barrier.handle);
     }
     vkCmdPipelineBarrier(
@@ -185,11 +183,11 @@ void CommandBuffer::dispatch(const utils::uvec3& global_workgroup_size) {
 }
 
 void CommandBuffer::copy_buffer_to_buffer(
-    const api::VulkanBuffer& source,
-    const api::VulkanBuffer& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const VulkanBuffer& source,
+    const VulkanBuffer& destination,
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   VK_CHECK_COND(
       state_ == CommandBuffer::State::BARRIERS_INSERTED,
       "Vulkan CommandBuffer: called copy_buffer_to_buffer() on a command buffer whose state "
@@ -208,11 +206,11 @@ void CommandBuffer::copy_buffer_to_buffer(
 }
 
 void CommandBuffer::copy_texture_to_texture(
-    const api::VulkanImage& source,
-    const api::VulkanImage& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const VulkanImage& source,
+    const VulkanImage& destination,
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   VK_CHECK_COND(
       state_ == CommandBuffer::State::BARRIERS_INSERTED,
       "Vulkan CommandBuffer: called copy_texture_to_texture() on a command buffer whose state "
@@ -253,11 +251,11 @@ void CommandBuffer::copy_texture_to_texture(
 }
 
 void CommandBuffer::copy_texture_to_buffer(
-    const api::VulkanImage& source,
-    const api::VulkanBuffer& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const VulkanImage& source,
+    const VulkanBuffer& destination,
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   VK_CHECK_COND(
       state_ == CommandBuffer::State::BARRIERS_INSERTED,
       "Vulkan CommandBuffer: called copy_texture_to_buffer() on a command buffer whose state "
@@ -291,11 +289,11 @@ void CommandBuffer::copy_texture_to_buffer(
 }
 
 void CommandBuffer::copy_buffer_to_texture(
-    const api::VulkanBuffer& source,
-    const api::VulkanImage& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const VulkanBuffer& source,
+    const VulkanImage& destination,
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   VK_CHECK_COND(
       state_ == CommandBuffer::State::BARRIERS_INSERTED,
       "Vulkan CommandBuffer: called copy_buffer_to_texture() on a command buffer whose state "
diff --git a/backends/vulkan/runtime/api/Command.h b/backends/vulkan/runtime/api/Command.h
index 85d859c0702..ff009de8fc0 100644
--- a/backends/vulkan/runtime/api/Command.h
+++ b/backends/vulkan/runtime/api/Command.h
@@ -94,32 +94,32 @@ class CommandBuffer final {
   void dispatch(const utils::uvec3&);
 
   void copy_buffer_to_buffer(
-      const api::VulkanBuffer&,
-      const api::VulkanBuffer&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&);
+      const VulkanBuffer&,
+      const VulkanBuffer&,
+      const utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&);
 
   void copy_texture_to_texture(
-      const api::VulkanImage&,
-      const api::VulkanImage&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&);
+      const VulkanImage&,
+      const VulkanImage&,
+      const utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&);
 
   void copy_texture_to_buffer(
-      const api::VulkanImage&,
-      const api::VulkanBuffer&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&);
+      const VulkanImage&,
+      const VulkanBuffer&,
+      const utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&);
 
   void copy_buffer_to_texture(
-      const api::VulkanBuffer&,
-      const api::VulkanImage&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&);
+      const VulkanBuffer&,
+      const VulkanImage&,
+      const utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&);
 
   void write_timestamp(VkQueryPool, const uint32_t) const;
   void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const;
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 9f6cdabf396..99d9ab0aa5d 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -236,7 +236,7 @@ UniformParamsBuffer& UniformParamsBuffer::operator=(
 }
 
 ParamsBindList::ParamsBindList(
-    std::initializer_list<const api::BufferBindInfo> init_list) {
+    std::initializer_list<const BufferBindInfo> init_list) {
   bind_infos.resize(init_list.size());
   std::copy(init_list.begin(), init_list.end(), bind_infos.begin());
 }
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index 5bc4506628b..f8bc923f394 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -196,9 +196,9 @@ class Context final {
       PipelineBarrier&,
       const S&,
       const D&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
-      const api::utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&,
+      const utils::uvec3&,
       VkFence fence_handle);
 
   template <typename... Arguments>
@@ -267,9 +267,9 @@ class UniformParamsBuffer final {
 };
 
 struct ParamsBindList final {
-  std::vector<api::BufferBindInfo> bind_infos;
+  std::vector<BufferBindInfo> bind_infos;
 
-  ParamsBindList(std::initializer_list<const api::BufferBindInfo> init_list);
+  ParamsBindList(std::initializer_list<const BufferBindInfo> init_list);
 };
 
 class StorageBuffer final {
@@ -376,18 +376,18 @@ inline void record_copy(
     CommandBuffer& cmd,
     const S& source,
     const D& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) = delete;
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) = delete;
 
 template <>
 inline void record_copy<VulkanBuffer, VulkanBuffer>(
     CommandBuffer& cmd,
     const VulkanBuffer& source,
     const VulkanBuffer& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   cmd.copy_buffer_to_buffer(
       source, destination, copy_range, src_offset, dst_offset);
 }
@@ -397,9 +397,9 @@ inline void record_copy<VulkanImage, VulkanImage>(
     CommandBuffer& cmd,
     const VulkanImage& source,
     const VulkanImage& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   cmd.copy_texture_to_texture(
       source, destination, copy_range, src_offset, dst_offset);
 }
@@ -409,9 +409,9 @@ inline void record_copy<VulkanImage, VulkanBuffer>(
     CommandBuffer& cmd,
     const VulkanImage& source,
     const VulkanBuffer& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   cmd.copy_texture_to_buffer(
       source, destination, copy_range, src_offset, dst_offset);
 }
@@ -421,9 +421,9 @@ inline void record_copy<VulkanBuffer, VulkanImage>(
     CommandBuffer& cmd,
     const VulkanBuffer& source,
     const VulkanImage& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset) {
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset) {
   cmd.copy_buffer_to_texture(
       source, destination, copy_range, src_offset, dst_offset);
 }
@@ -440,9 +440,9 @@ inline bool Context::submit_copy(
     PipelineBarrier& pipeline_barrier,
     const S& source,
     const D& destination,
-    const api::utils::uvec3& copy_range,
-    const api::utils::uvec3& src_offset,
-    const api::utils::uvec3& dst_offset,
+    const utils::uvec3& copy_range,
+    const utils::uvec3& src_offset,
+    const utils::uvec3& dst_offset,
     VkFence fence_handle) {
   // If any of the provided arguments does not have memory associated with it,
   // then exit early as there is no work to be done. However, if a fence has
diff --git a/backends/vulkan/runtime/api/Pipeline.cpp b/backends/vulkan/runtime/api/Pipeline.cpp
index f4be0039e67..bc5d46af21c 100644
--- a/backends/vulkan/runtime/api/Pipeline.cpp
+++ b/backends/vulkan/runtime/api/Pipeline.cpp
@@ -137,7 +137,7 @@ uint32_t SpecVar::val_size() const {
 }
 
 uint32_t SpecVar::val_offset() const {
-  return api::utils::safe_downcast<uint32_t>(offsetof(SpecVar, value));
+  return utils::safe_downcast<uint32_t>(offsetof(SpecVar, value));
 }
 
 bool operator==(const SpecVar& lhs, const SpecVar& rhs) {
diff --git a/backends/vulkan/runtime/api/Pipeline.h b/backends/vulkan/runtime/api/Pipeline.h
index 351c8be918a..118a67e37d5 100644
--- a/backends/vulkan/runtime/api/Pipeline.h
+++ b/backends/vulkan/runtime/api/Pipeline.h
@@ -71,7 +71,7 @@ class SpecVarList final {
   }
 
   inline uint32_t size() const {
-    return api::utils::safe_downcast<uint32_t>(vars.size());
+    return utils::safe_downcast<uint32_t>(vars.size());
   }
 
   inline uint32_t data_nbytes() const {

From ebdb152aba547f17ef2952d105658e16afb70a0b Mon Sep 17 00:00:00 2001
From: Jorge Pineda <jorgep31415@meta.com>
Date: Thu, 9 May 2024 12:22:59 -0700
Subject: [PATCH 21/62] Save and load VkPipelineCache data if path is specified
 (#3546)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3546

## Context
Pipeline creation involves the compilation of shader SPIR-V code into machine-specific code. This makes the application's first model-load via the `Program::load_method()` ET-API very slow, due to the creation of compute pipelines via the `vkCreateComputePipelines()` VK-API. To amortize that cost, Vulkan offers a [Compute Pipeline Cache API](https://docs.vulkan.org/guide/latest/pipeline_cache.html). Following [this Vulkan example](https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/performance/pipeline_cache), we can (A) retrieve the compiled machine-specific code and save it to a file, and (B) load it from that file on the next run. For an internal model executing on a resource-constrained device, this improves model-load time from ~1200ms to ~500ms.

## This change
We implement the ET-VK logic for both (A) and (B). Note that these changes are actually a no-op unless you set `cache_data_path` manually (it is currently hard-coded to an empty string in `init_global_vulkan_runtime()`). The expectation is for the client application to specify the file path of their pipeline cache data if they want to leverage this optimization. In a future ET-wide change, we will expose this file path config parameter through the ET-API.
ghstack-source-id: 225763792
bypass-github-export-checks
bypass-github-pytorch-ci-checks
bypass-github-executorch-ci-checks

Reviewed By: SS-JIA

Differential Revision: D57085276

fbshipit-source-id: 993dc55d5930c913884ad455f359b62afb75bf87
---
 backends/vulkan/runtime/api/Adapter.cpp  |  5 ++-
 backends/vulkan/runtime/api/Adapter.h    |  3 +-
 backends/vulkan/runtime/api/Pipeline.cpp | 54 +++++++++++++++++++++---
 backends/vulkan/runtime/api/Pipeline.h   |  8 +++-
 backends/vulkan/runtime/api/Runtime.cpp  |  7 ++-
 backends/vulkan/runtime/api/Runtime.h    |  1 +
 6 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/backends/vulkan/runtime/api/Adapter.cpp b/backends/vulkan/runtime/api/Adapter.cpp
index 5db2642e3ec..932678f18fc 100644
--- a/backends/vulkan/runtime/api/Adapter.cpp
+++ b/backends/vulkan/runtime/api/Adapter.cpp
@@ -292,7 +292,8 @@ DeviceHandle::~DeviceHandle() {
 Adapter::Adapter(
     VkInstance instance,
     PhysicalDevice physical_device,
-    const uint32_t num_queues)
+    const uint32_t num_queues,
+    const std::string& cache_data_path)
     : queue_usage_mutex_{},
       physical_device_(std::move(physical_device)),
       queues_{},
@@ -307,7 +308,7 @@ Adapter::Adapter(
       shader_layout_cache_(device_.handle_),
       shader_cache_(device_.handle_),
       pipeline_layout_cache_(device_.handle_),
-      compute_pipeline_cache_(device_.handle_),
+      compute_pipeline_cache_(device_.handle_, cache_data_path),
       sampler_cache_(device_.handle_),
       vma_(instance_, physical_device_.handle, device_.handle_) {}
 
diff --git a/backends/vulkan/runtime/api/Adapter.h b/backends/vulkan/runtime/api/Adapter.h
index ef246260021..fcbba281642 100644
--- a/backends/vulkan/runtime/api/Adapter.h
+++ b/backends/vulkan/runtime/api/Adapter.h
@@ -101,7 +101,8 @@ class Adapter final {
   explicit Adapter(
       VkInstance instance,
       PhysicalDevice physical_device,
-      const uint32_t num_queues);
+      const uint32_t num_queues,
+      const std::string& cache_data_path);
 
   Adapter(const Adapter&) = delete;
   Adapter& operator=(const Adapter&) = delete;
diff --git a/backends/vulkan/runtime/api/Pipeline.cpp b/backends/vulkan/runtime/api/Pipeline.cpp
index bc5d46af21c..a6bff47cac1 100644
--- a/backends/vulkan/runtime/api/Pipeline.cpp
+++ b/backends/vulkan/runtime/api/Pipeline.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/backends/vulkan/runtime/api/Pipeline.h>
 
+#include <fstream>
+
 namespace vkcompute {
 namespace api {
 
@@ -358,17 +360,24 @@ void PipelineLayoutCache::purge() {
 // ComputePipelineCache
 //
 
-ComputePipelineCache::ComputePipelineCache(VkDevice device)
+ComputePipelineCache::ComputePipelineCache(
+    VkDevice device,
+    const std::string& cache_data_path)
     : cache_mutex_{},
       device_(device),
       pipeline_cache_{VK_NULL_HANDLE},
-      cache_{} {
-  const VkPipelineCacheCreateInfo pipeline_cache_create_info{
+      cache_{},
+      cache_data_path_(cache_data_path) {
+  VkPipelineCacheCreateInfo pipeline_cache_create_info{};
+
+  auto buffer = load_cache();
+
+  pipeline_cache_create_info = {
       VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // sType
       nullptr, // pNext
       0u, // flags
-      0u, // initialDataSize
-      nullptr, // pInitialData
+      buffer.size(), // initialDataSize
+      buffer.data(), // pInitialData
   };
 
   VK_CHECK(vkCreatePipelineCache(
@@ -392,6 +401,9 @@ ComputePipelineCache::~ComputePipelineCache() {
   if (VK_NULL_HANDLE == pipeline_cache_) {
     return;
   }
+
+  save_cache();
+
   vkDestroyPipelineCache(device_, pipeline_cache_, nullptr);
   pipeline_cache_ = VK_NULL_HANDLE;
 }
@@ -416,5 +428,37 @@ void ComputePipelineCache::purge() {
   cache_.clear();
 }
 
+std::vector<char> ComputePipelineCache::load_cache() {
+  // Return if path is not specified; this means the optimization is disabled
+  if (cache_data_path_.empty()) {
+    return {};
+  }
+
+  // Return if file doesn't exist; this is expected on the first model-load
+  std::ifstream file(cache_data_path_, std::ios::binary | std::ios::ate);
+  if (file.fail()) {
+    return {};
+  }
+
+  auto size = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  std::vector<char> buffer(size);
+  file.read(buffer.data(), size);
+
+  return buffer;
+}
+
+void ComputePipelineCache::save_cache() {
+  size_t size{};
+  vkGetPipelineCacheData(device_, pipeline_cache_, &size, nullptr);
+
+  std::vector<char> buffer(size);
+  vkGetPipelineCacheData(device_, pipeline_cache_, &size, buffer.data());
+
+  std::ofstream file(cache_data_path_, std::ios::binary);
+  file.write(buffer.data(), buffer.size());
+}
+
 } // namespace api
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/api/Pipeline.h b/backends/vulkan/runtime/api/Pipeline.h
index 118a67e37d5..35b3b6275b4 100644
--- a/backends/vulkan/runtime/api/Pipeline.h
+++ b/backends/vulkan/runtime/api/Pipeline.h
@@ -216,7 +216,9 @@ class PipelineLayoutCache final {
 
 class ComputePipelineCache final {
  public:
-  explicit ComputePipelineCache(VkDevice device);
+  explicit ComputePipelineCache(
+      VkDevice device,
+      const std::string& cache_data_path);
 
   ComputePipelineCache(const ComputePipelineCache&) = delete;
   ComputePipelineCache& operator=(const ComputePipelineCache&) = delete;
@@ -266,6 +268,9 @@ class ComputePipelineCache final {
   };
 
  private:
+  std::vector<char> load_cache();
+  void save_cache();
+
   // Multiple threads could potentially be adding entries into the cache, so use
   // a mutex to manage access
   std::mutex cache_mutex_;
@@ -273,6 +278,7 @@ class ComputePipelineCache final {
   VkDevice device_;
   VkPipelineCache pipeline_cache_;
   std::unordered_map<Key, Value, Hasher> cache_;
+  const std::string cache_data_path_;
 
  public:
   VkPipeline retrieve(const Key&);
diff --git a/backends/vulkan/runtime/api/Runtime.cpp b/backends/vulkan/runtime/api/Runtime.cpp
index ebed34162f3..432af326a53 100644
--- a/backends/vulkan/runtime/api/Runtime.cpp
+++ b/backends/vulkan/runtime/api/Runtime.cpp
@@ -253,12 +253,14 @@ std::unique_ptr<Runtime> init_global_vulkan_runtime() {
 #endif /* VULKAN_DEBUG */
   const bool init_default_device = true;
   const uint32_t num_requested_queues = 1; // TODO: raise this value
+  const std::string cache_data_path = ""; // TODO: expose to client
 
   const RuntimeConfiguration default_config{
       enable_validation_messages,
       init_default_device,
       AdapterSelector::First,
       num_requested_queues,
+      cache_data_path,
   };
 
   try {
@@ -351,7 +353,10 @@ uint32_t Runtime::create_adapter(const Selector& selector) {
   // Otherwise, create an adapter for the selected physical device
   adapter_i = utils::safe_downcast<int32_t>(adapters_.size());
   adapters_.emplace_back(new Adapter(
-      instance_, device_mapping.first, config_.num_requested_queues));
+      instance_,
+      device_mapping.first,
+      config_.num_requested_queues,
+      config_.cache_data_path));
   device_mapping.second = adapter_i;
 
   return adapter_i;
diff --git a/backends/vulkan/runtime/api/Runtime.h b/backends/vulkan/runtime/api/Runtime.h
index 6cfcc0ca03a..e4cb6922ad8 100644
--- a/backends/vulkan/runtime/api/Runtime.h
+++ b/backends/vulkan/runtime/api/Runtime.h
@@ -39,6 +39,7 @@ struct RuntimeConfiguration final {
   bool init_default_device;
   AdapterSelector default_selector;
   uint32_t num_requested_queues;
+  std::string cache_data_path;
 };
 
 class Runtime final {

From cc2d3b57420c9dd73abf458c8cc45e099d9e8a7b Mon Sep 17 00:00:00 2001
From: Yupeng Zhang <yupeng@meta.com>
Date: Thu, 9 May 2024 14:37:40 -0700
Subject: [PATCH 22/62] Revise memory monitor to align with Xcode metrics
 (#3131)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3131

Ref: https://forums.developer.apple.com/forums/thread/105088

*If you’re going to record a single number, this footprint value is a good one to use. I don’t think we guarantee that it’ll align with the Xcode memory gauge, but it’s much more useful value than all the older stuff (like resident_size)*

Therefore, revise the memory monitor to report `phys_footprint` (from `TASK_VM_INFO`) instead of `resident_size`.

Reviewed By: shoumikhin

Differential Revision: D56290391

fbshipit-source-id: f3f408f2677c3788a25e46af0df0eccd23a21c2f
---
 .../LLaMA/LLaMA/Application/ResourceMonitor.swift         | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ResourceMonitor.swift b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ResourceMonitor.swift
index 847eb51bae3..3ec16463e8a 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ResourceMonitor.swift
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ResourceMonitor.swift
@@ -33,16 +33,16 @@ final class ResourceMonitor: ObservableObject {
   }
 
   private func usedMemoryInMB() -> Int {
-    var info = mach_task_basic_info()
-    var count = mach_msg_type_number_t(MemoryLayout<mach_task_basic_info>.size) / 4
+    var info = task_vm_info_data_t()
+    var count = mach_msg_type_number_t(MemoryLayout<task_vm_info>.size) / 4
 
     let kerr: kern_return_t = withUnsafeMutablePointer(to: &info) {
       $0.withMemoryRebound(to: integer_t.self, capacity: Int(count)) {
-        task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count)
+        task_info(mach_task_self_, task_flavor_t(TASK_VM_INFO), $0, &count)
       }
     }
     guard kerr == KERN_SUCCESS else { return 0 }
-    return Int(info.resident_size / 0x100000)
+    return Int(info.phys_footprint / 0x100000)
   }
 
   private func availableMemoryInMB() -> Int {

From 2ac7f2a84ddb3ad573af5488a03f5afa6d342318 Mon Sep 17 00:00:00 2001
From: Tarun Karuturi <tkaruturi@meta.com>
Date: Thu, 9 May 2024 15:13:43 -0700
Subject: [PATCH 23/62] Add support for method level executorch backend config
 (#3266)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3266

There are use cases where we might like to supply a separate ExecutorchBackendConfig for each method in the model. An example use case is where we might want to allocate inputs for one method and not allocate them for another. In order to support this, in this diff we allow the `memory_planning_pass` field of the `ExecutorchBackendConfig` passed to `to_executorch` to be either a single pass or a dictionary of passes keyed by method name.

Reviewed By: JacobSzwejbka, cccclai

Differential Revision: D56499598

fbshipit-source-id: e02947597e4f898c7b4963b5922904f3b642a5e5
---
 exir/capture/_config.py           |  9 +++++--
 exir/program/_program.py          | 17 +++++++++----
 exir/program/test/test_program.py | 40 +++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index c03be0e24f3..dd0ed94094f 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
 
 from executorch.exir.dynamic_shape import DynamicMemoryPlanningMode
 from executorch.exir.pass_manager import PassType
@@ -45,7 +45,12 @@ class EdgeCompileConfig:
 @dataclass
 class ExecutorchBackendConfig:
     passes: List[PassType] = field(default_factory=list)
-    memory_planning_pass: PassType = MemoryPlanningPass("greedy")
+
+    # A single memory planning pass can be defined for all the programs in the
+    # EdgeProgramManager or can be defined per program.
+    memory_planning_pass: Union[PassType, Dict[str, PassType]] = MemoryPlanningPass(
+        "greedy"
+    )
     to_out_var_pass: PassType = ToOutVarPass(ignore_to_out_var_failure=False)
     dynamic_memory_planning_mode: DynamicMemoryPlanningMode = (
         DynamicMemoryPlanningMode.UPPER_BOUND
diff --git a/exir/program/_program.py b/exir/program/_program.py
index f2c2a5438fd..c5afe011691 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -412,7 +412,7 @@ def to_executorch(
         # Existing user passes dont use run so Im just cheating here because they dont need to work on mutable buffers yet.
         # After exir.capture is gone I will clean up the memory planning infra to be consistent.
         # Frankly all of exir has big code quality issues because of the migrations that need to be addressed.
-        new_gm_res = config.memory_planning_pass(new_gm)  # pyre-ignore[19]
+        new_gm_res = config.memory_planning_pass(new_gm)  # pyre-ignore[29]
         assert new_gm_res is not None
         new_gm = new_gm_res.graph_module
         new_prog = ExirExportedProgram(
@@ -889,7 +889,8 @@ def to_backend(
         )
 
     def to_executorch(
-        self, config: Optional[ExecutorchBackendConfig] = None
+        self,
+        config: Optional[ExecutorchBackendConfig] = None,
     ) -> "ExecutorchProgramManager":
         """
         Transforms the program to the ExecuTorch backend.
@@ -926,13 +927,19 @@ def to_executorch(
                     # TODO(who?)
                     p.update_placeholder_tensor_specs(program, new_gm)
 
+            if isinstance(config.memory_planning_pass, dict):
+                memory_planning_pass = config.memory_planning_pass.get(
+                    name, ExecutorchBackendConfig().memory_planning_pass
+                )
+            else:
+                memory_planning_pass = config.memory_planning_pass
             # TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work
-            if hasattr(config.memory_planning_pass, "run"):
-                new_gm_res = config.memory_planning_pass.run(  # pyre-ignore[16]
+            if hasattr(memory_planning_pass, "run"):
+                new_gm_res = memory_planning_pass.run(  # pyre-ignore[16]
                     new_gm, new_signature
                 )
             else:
-                new_gm_res = config.memory_planning_pass(new_gm)  # pyre-ignore[19]
+                new_gm_res = memory_planning_pass(new_gm)  # pyre-ignore[29]
             assert new_gm_res is not None
             new_gm = new_gm_res.graph_module
 
diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py
index 51f0fcf0788..f84f0c1cd02 100644
--- a/exir/program/test/test_program.py
+++ b/exir/program/test/test_program.py
@@ -16,6 +16,7 @@
 from executorch.exir.error import ExportError
 from executorch.exir.lowered_backend_module import get_lowered_submodules
 from executorch.exir.pass_base import ExportPass
+from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.program._program import (
     EdgeProgramManager,
     ExecutorchProgramManager,
@@ -160,6 +161,45 @@ def test_executorch_manager_basic_api(self):
             3,
         )
 
+    def test_executorch_manager_multi_config(self):
+        def get_executorch_memory_planning_passes() -> Dict[str, MemoryPlanningPass]:
+            return {
+                "forward": MemoryPlanningPass(
+                    memory_planning_algo="greedy",
+                    alloc_graph_input=True,
+                    alloc_graph_output=False,
+                ),
+                "foo": MemoryPlanningPass(
+                    memory_planning_algo="greedy",
+                    alloc_graph_input=False,
+                    alloc_graph_output=True,
+                ),
+            }
+
+        executorch_manager: ExecutorchProgramManager = to_edge(
+            get_exported_programs(), get_config_methods()
+        ).to_executorch(
+            ExecutorchBackendConfig(
+                memory_planning_pass=get_executorch_memory_planning_passes()
+            )
+        )
+
+        method = executorch_manager._emitter_output.program.execution_plan[0]
+        if method.name == "forward":
+            for input_val in method.inputs:
+                evalue = method.values[input_val]
+                self.assertEqual(evalue.val.allocation_info, None)
+            for output_val in method.outputs:
+                evalue = method.values[output_val]
+                self.assertNotEqual(evalue.val.allocation_info, None)
+        else:
+            for input_val in method.inputs:
+                evalue = method.values[input_val]
+                self.assertEqual(evalue.val.allocation_info, None)
+            for output_val in method.outputs:
+                evalue = method.values[output_val]
+                self.assertNotEqual(evalue.val.allocation_info, None)
+
     def test_no_getattr(self):
         class Mul(torch.nn.Module):
             def forward(self, x: torch.Tensor) -> torch.Tensor:

From 6c5612201249243d7c483bf563529104cbedf2cf Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Thu, 9 May 2024 15:18:22 -0700
Subject: [PATCH 24/62] Break flag bc (#3128)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3128

XNNPACK broke backwards compatibility by forcing pooling operations to reduce dims and introducing a flag that allows these operations to keep the dims.

This is backwards breaking because previously XNNPACK would keep the dims if no flag was given; now a flag must be specified to keep the dims. While initially we proposed the inverse to maintain backwards compatibility, they have encountered breakages and have decided to commit to this breakage. As we are a downstream dependency and will have to accept this breakage ourselves, it is important that we break early, before this is used in any production code.

As a result, we break BC here by accepting XNNPACK's change.

```
git diff roll_back_commit > rollback.patch
cd fbsource/fbcode/xplat/third-party/XNNPACK/XNNPACK
git apply ../../../../../../rollback.patch
```

We have to pick up this change in ExecuTorch; as a result, we point the XNNPACK dependency at the branch containing these changes:
https://github.com/digantdesai/XNNPACK/commits/et_v21/

Reviewed By: digantdesai

Differential Revision: D56271242

fbshipit-source-id: f05a0c98cb3b8e0b52ded9480ae5fb3ac71bbc14
---
 backends/xnnpack/operators/op_avg_pooling2d.py |  3 ++-
 backends/xnnpack/operators/op_max_pool2d.py    |  3 ++-
 backends/xnnpack/operators/op_mean_dim.py      |  3 ++-
 backends/xnnpack/third-party/XNNPACK           |  2 +-
 backends/xnnpack/utils/xnnpack_constants.py    | 12 ++++++++----
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/backends/xnnpack/operators/op_avg_pooling2d.py b/backends/xnnpack/operators/op_avg_pooling2d.py
index 18f981cb330..94cd06cc08e 100644
--- a/backends/xnnpack/operators/op_avg_pooling2d.py
+++ b/backends/xnnpack/operators/op_avg_pooling2d.py
@@ -16,6 +16,7 @@
     XNNGraph,
     XNode,
 )
+from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_FLAG_KEEP_DIMS
 
 
 @register_node_visitor
@@ -67,7 +68,7 @@ def define_node(
                 dilation_width=0,  # Unused
                 input_id=input_id,
                 output_id=output_id,
-                flags=0,
+                flags=XNN_FLAG_KEEP_DIMS,
             ),
             debug_handle=debug_handle,
         )
diff --git a/backends/xnnpack/operators/op_max_pool2d.py b/backends/xnnpack/operators/op_max_pool2d.py
index 6fb49d30d57..d1a010295ef 100644
--- a/backends/xnnpack/operators/op_max_pool2d.py
+++ b/backends/xnnpack/operators/op_max_pool2d.py
@@ -18,6 +18,7 @@
     XNNMaxPooling2d,
     XNode,
 )
+from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_FLAG_KEEP_DIMS
 
 
 @register_node_visitor
@@ -80,7 +81,7 @@ def define_node(
             kwargs["dilation_height"] = dilation[0]
             kwargs["dilation_width"] = dilation[1]
 
-        kwargs["flags"] = 0
+        kwargs["flags"] = XNN_FLAG_KEEP_DIMS
 
         ser_node = XNode(
             xnode_union=XNNMaxPooling2d(
diff --git a/backends/xnnpack/operators/op_mean_dim.py b/backends/xnnpack/operators/op_mean_dim.py
index fe9f2249631..663606a8880 100644
--- a/backends/xnnpack/operators/op_mean_dim.py
+++ b/backends/xnnpack/operators/op_mean_dim.py
@@ -18,6 +18,7 @@
     XNNGraph,
     XNode,
 )
+from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_FLAG_KEEP_DIMS
 
 
 @register_node_visitor
@@ -70,7 +71,7 @@ def define_node(
 
         ser_node = XNode(
             xnode_union=XNNGlobalAvgPooling2d(
-                input_id=input_id, output_id=output_id, flags=0
+                input_id=input_id, output_id=output_id, flags=XNN_FLAG_KEEP_DIMS
             ),
             debug_handle=debug_handle,
         )
diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK
index 70bbd07c1de..20c0d886fb7 160000
--- a/backends/xnnpack/third-party/XNNPACK
+++ b/backends/xnnpack/third-party/XNNPACK
@@ -1 +1 @@
-Subproject commit 70bbd07c1de310a1f89379c746b8f24a506c3283
+Subproject commit 20c0d886fb78d6497362e8303b999bf5d67aaa02
diff --git a/backends/xnnpack/utils/xnnpack_constants.py b/backends/xnnpack/utils/xnnpack_constants.py
index 63c8d6fdeef..351cc8ad897 100644
--- a/backends/xnnpack/utils/xnnpack_constants.py
+++ b/backends/xnnpack/utils/xnnpack_constants.py
@@ -8,21 +8,25 @@
 UINT32_MAX = 4294967295
 XNN_EXTRA_BYTES = 16
 XNN_MAX_TENSOR_DIMS = 6
-XNN_FLAG_SPARSE_INFERENCE = 0x00000001
-XNN_FLAG_HINT_SPARSE_INFERENCE = XNN_FLAG_SPARSE_INFERENCE
-XNN_FLAG_FP16_INFERENCE = 0x00000002
-XNN_FLAG_HINT_FP16_INFERENCE = XNN_FLAG_FP16_INFERENCE
+XNN_FLAG_HINT_SPARSE_INFERENCE = 0x00000001
+XNN_FLAG_HINT_FP16_INFERENCE = 0x00000002
 XNN_FLAG_FORCE_FP16_INFERENCE = 0x00000004
 XNN_FLAG_BASIC_PROFILING = 0x00000008
+XNN_FLAG_JIT = 0x00000010
 XNN_FLAG_DEPTHWISE_CONVOLUTION = 0x00000001
 XNN_FLAG_TRANSPOSE_WEIGHTS = 0x00000001
 XNN_FLAG_INPUT_NHWC = 0x00000002
 XNN_FLAG_TENSORFLOW_SAME_PADDING = 0x00000004
+XNN_FLAG_TRANSPOSE_B = XNN_FLAG_TRANSPOSE_WEIGHTS
+XNN_FLAG_TRANSPOSE_A = 0x00000002
 XNN_FLAG_TENSORFLOW_RESHAPE_2D = 0x00000004
 XNN_FLAG_TENSORFLOW_LEGACY_MODE = 0x00000004
 XNN_FLAG_FP32_STATIC_WEIGHTS = 0x00000008
 XNN_FLAG_ALIGN_CORNERS = 0x00000008
 XNN_FLAG_YIELD_WORKERS = 0x00000010
+XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER = 0x00000020
+XNN_FLAG_KEEP_DIMS = 0x00000040
+XNN_EXTRA_QUANTIZATION_PARAMS = 8
 XNN_VALUE_FLAG_EXTERNAL_INPUT = 0x00000001
 XNN_VALUE_FLAG_EXTERNAL_OUTPUT = 0x00000002
 XNN_VALUE_FLAG_PERSISTENT = 0x00000004

From 8a63430b121a32d0a2027718e319164266b6e21d Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Thu, 9 May 2024 15:18:22 -0700
Subject: [PATCH 25/62] add per-channel tests for linear (#3551)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3551

Adding a test for qc8 linear

Reviewed By: digantdesai

Differential Revision: D55941565

fbshipit-source-id: ecc870dbd879e00790a1052aaf3b4be748b02c94
---
 backends/xnnpack/test/ops/linear.py | 33 ++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py
index 85b760e38ad..06ca72e377c 100644
--- a/backends/xnnpack/test/ops/linear.py
+++ b/backends/xnnpack/test/ops/linear.py
@@ -48,6 +48,18 @@ def test_fp32_linear(self):
                     num_batch_dims=num_batch_dims,
                 )
 
+    def test_qc8_linear(self):
+        for use_bias in (True, False):
+            for num_batch_dims in range(1, 3):
+                self._test_linear(
+                    lambda in_size, out_size: torch.nn.Linear(
+                        in_size, out_size, bias=use_bias  # noqa
+                    ),
+                    uses_bias=use_bias,
+                    quant_type="per_channel",
+                    num_batch_dims=num_batch_dims,
+                )
+
     def test_fp32_addmm(self):
         """
         Note that the ConvertToLinear pass requires the weight matrix to be transposed.
@@ -107,7 +119,7 @@ def forward(self, x):
                     ),
                     num_batch_dims=num_batch_dims,
                     uses_bias=use_bias,
-                    quant=True,
+                    quant_type="per_tensor",
                 )
 
     def test_qs8_linear(self):
@@ -119,6 +131,7 @@ def test_qs8_linear(self):
                     ),
                     uses_bias=use_bias,
                     num_batch_dims=num_batch_dims,
+                    quant_type="per_tensor",
                 )
 
     @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.")
@@ -726,7 +739,7 @@ def _test_linear(
         make_module,
         uses_bias,
         num_batch_dims=1,
-        quant=False,
+        quant_type=None,
         dtype: torch.dtype = torch.float,
         atol=1e-03,
     ):
@@ -746,6 +759,8 @@ def _test_linear(
         input_sizes = [4, 37, 17]
         output_sizes = [4, 17, 37]
 
+        quant = quant_type is not None
+
         """
         Note that torch.nn.Linear maps to aten.mm.default (no bias) or aten.addmm.default (bias),
         which ares then transformed into aten.linear.default by the ConvertToLinear pass.
@@ -769,7 +784,19 @@ def _test_linear(
             tester = Tester(module, inputs, dynamic_shapes=dynamic_shape)
 
             if quant:
-                tester.quantize()
+                if quant_type == "per_channel":
+                    quant_config = get_symmetric_quantization_config(
+                        is_per_channel=True,
+                        is_dynamic=False,
+                    )
+                elif quant_type == "per_tensor":
+                    quant_config = get_symmetric_quantization_config(
+                        is_per_channel=False,
+                        is_dynamic=False,
+                    )
+                else:
+                    raise ValueError(f"Unsupported quant type {quant_type}")
+                tester.quantize(Quantize(quantization_config=quant_config))
 
             tester.export()
             tester.check_count({aten_op: 1})

From 0563aa7a58cf7123c3e6b590c7515d6095c64672 Mon Sep 17 00:00:00 2001
From: Matthias Cremon <matthiascremon@meta.com>
Date: Thu, 9 May 2024 15:22:23 -0700
Subject: [PATCH 26/62] Disable view_copy elimination for graph outputs (#3565)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3565

If the `view_copy` op is a graph output, leave it as a view_copy for now since the output pointer may be modified at runtime when deploying on device.

Right now, the modified pointer would be ignored since the view_copy op will always point to its predecessor memory.

cc chrismthompson jcoriell fengwang

Reviewed By: JacobSzwejbka, metascroy

Differential Revision: D57132664

fbshipit-source-id: b97bd81166b728c306fae8b212aeb5e38348b391
---
 .../replace_view_copy_with_view_pass.py       | 12 +++--
 exir/tests/test_passes.py                     |  7 +--
 exir/tests/test_remove_view_copy.py           | 48 ++++++++++++-------
 3 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/exir/passes/replace_view_copy_with_view_pass.py b/exir/passes/replace_view_copy_with_view_pass.py
index a9304f3eec8..8d3a2a32126 100644
--- a/exir/passes/replace_view_copy_with_view_pass.py
+++ b/exir/passes/replace_view_copy_with_view_pass.py
@@ -273,7 +273,9 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             if not isinstance(module, torch.fx.GraphModule):
                 continue
             for node in module.graph.nodes:
-                if _is_view_copy(node):
+                # Note: We only replace view_copy nodes that are not output, since
+                # the output pointer could be modified at runtime (T187925929)
+                if _is_view_copy(node) and node.next.op != "output":
                     base, _ = node.args
                     node.target = _VIEW_OP
 
@@ -298,7 +300,9 @@ def ensures(self, graph_module: torch.fx.GraphModule) -> None:
             if not isinstance(module, torch.fx.GraphModule):
                 continue
             for node in module.graph.nodes:
-                assert not _is_view_copy(node)
+                # Note: We only replace view_copy nodes that are not output, since
+                # the output pointer could be modified at runtime (T187925929)
+                assert not (_is_view_copy(node) and node.next.op != "output")
                 if node.op == "call_function" and node.target == _VIEW_OP:
                     assert isinstance(node.meta["spec"], _ViewSpec)
 
@@ -311,6 +315,8 @@ def requires(self, graph_module: torch.fx.GraphModule) -> None:
             if not isinstance(module, torch.fx.GraphModule):
                 continue
             for node in module.graph.nodes:
-                if _is_view_copy(node):
+                # Note: We only replace view_copy nodes that are not output, since
+                # the output pointer could be modified at runtime (T187925929)
+                if _is_view_copy(node) and node.next.op != "output":
                     base, size = node.args
                     assert not _is_view_copy(base)
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 9c5e4b59adc..0377f70a150 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -1630,10 +1630,11 @@ def forward(self, x):
         assert gm_res is not None
         gm = gm_res.graph_module
 
-        # Check before transformation
+        # Check after transformation
+        # Note: one view copy is not replaced, because it's the output of the graph
         FileCheck().check_count(
-            "torch.ops.aten.view_copy.default", 0, exactly=True
+            "torch.ops.aten.view_copy.default", 1, exactly=True
         ).run(gm.code)
-        FileCheck().check_count("executorch_exir_memory_view", 2, exactly=True).run(
+        FileCheck().check_count("executorch_exir_memory_view", 1, exactly=True).run(
             gm.code
         )
diff --git a/exir/tests/test_remove_view_copy.py b/exir/tests/test_remove_view_copy.py
index 0c5b61f8d8f..b3ad1f7d5a7 100644
--- a/exir/tests/test_remove_view_copy.py
+++ b/exir/tests/test_remove_view_copy.py
@@ -19,6 +19,8 @@ def __init__(self):
         super().__init__()
         self.parameter = nn.Parameter(torch.rand(5, 6))
         self.parameter.requires_grad = False
+        self.parameter2 = nn.Parameter(torch.rand(30))
+        self.parameter2.requires_grad = False
 
     def forward(self, x):
         v1 = self.parameter.view(
@@ -28,7 +30,9 @@ def forward(self, x):
         v3 = torch.ops.aten.mul.Tensor(v1, v2).view(
             30
         )  # removed, lifetime of mul.Tensor will be extended
-        return v3
+        v4 = torch.ops.aten.mul.Tensor(v3, self.parameter2)
+        v5 = v4.view(6, 5)  # not removed, output of the graph
+        return v5
 
     def get_example_inputs(self):
         return (torch.rand(5, 6),)
@@ -106,20 +110,25 @@ def test_spec(self) -> None:
         # etpm.exported_program().graph.print_tabular()
 
         # idx  opcode         name                      target                              args                                                kwargs
-        # ---  -------------  ------------------------  ----------------------------------  --------------------------------------------------  --------------
+        # ---  -------------  ------------------------  ----------------------------------  --------------------------------------------------  ----------------
         # 0    placeholder    p_parameter               p_parameter                         ()                                                  {}
-        # 1    placeholder    x                         x                                   ()                                                  {}
-        # 2    call_function  aten_view_copy_default    <function view at 0x7facb871fb50>   (p_parameter, [6, 5])                               {}
-        # 3    call_function  aten_view_copy_default_1  <function view at 0x7facb871fb50>   (x, [6, 5])                                         {}
-        # 4    call_function  alloc                     <function alloc at 0x7facb871fa30>  (((6, 5), torch.float32),)                          {}
-        # 5    call_function  aten_mul_tensor           aten.mul.out                        (aten_view_copy_default, aten_view_copy_default_1)  {'out': alloc}
-        # 6    call_function  aten_view_copy_default_2  <function view at 0x7facb871fb50>   (aten_mul_tensor, [30])                             {}
-        # 7    output         output_1                  output                              ((aten_view_copy_default_2,),)                      {}
+        # 1    placeholder    p_parameter2              p_parameter2                        ()                                                  {}
+        # 2    placeholder    x                         x                                   ()                                                  {}
+        # 3    call_function  aten_view_copy_default    <function view at 0x7fe57bea6d40>   (p_parameter, [6, 5])                               {}
+        # 4    call_function  aten_view_copy_default_1  <function view at 0x7fe57bea6d40>   (x, [6, 5])                                         {}
+        # 5    call_function  alloc                     <function alloc at 0x7fe57bea6c20>  (((6, 5), torch.float32),)                          {}
+        # 6    call_function  aten_mul_tensor           aten.mul.out                        (aten_view_copy_default, aten_view_copy_default_1)  {'out': alloc}
+        # 7    call_function  aten_view_copy_default_2  <function view at 0x7fe57bea6d40>   (aten_mul_tensor, [30])                             {}
+        # 8    call_function  alloc_1                   <function alloc at 0x7fe57bea6c20>  (((30,), torch.float32),)                           {}
+        # 9    call_function  aten_mul_tensor_1         aten.mul.out                        (aten_view_copy_default_2, p_parameter2)            {'out': alloc_1}
+        # 10   call_function  alloc_2                   <function alloc at 0x7fe57bea6c20>  (((6, 5), torch.float32),)                          {}
+        # 11   call_function  aten_view_copy_default_3  aten.view_copy.out                  (aten_mul_tensor_1, [6, 5])                         {'out': alloc_2}
+        # 12   output         output_1                  output                              ((aten_view_copy_default_3,),)                      {}
 
         for node in etpm.exported_program().graph.nodes:
             if node.name == "p_parameter":
-                # p_parameter's lifetime is extended through aten_view_copy_default (memory.view) to idx 5
-                self.assertEqual(node.meta["spec"].lifetime, [0, 5])
+                # p_parameter's lifetime is extended through aten_view_copy_default (memory.view) to idx 6
+                self.assertEqual(node.meta["spec"].lifetime, [0, 6])
             elif node.name == "aten_view_copy_default":
                 # aten_view_copy_default is a memory.view of p_parameter.
                 # p_parameter is a constant with storage, so we check that the view's storage matches the base
@@ -149,8 +158,8 @@ def test_spec(self) -> None:
                     node.meta["spec"].lifetime, node.args[0].meta["spec"].lifetime
                 )
             elif node.name == "aten_mul_tensor":
-                # aten_mul_tensor's lifetime is extended through aten_view_copy_default_2 (memory.view) to idx 7
-                self.assertEqual(node.meta["spec"].lifetime, [4, 7])
+                # aten_mul_tensor's lifetime is extended through aten_view_copy_default_2 (memory.view) to idx 9
+                self.assertEqual(node.meta["spec"].lifetime, [5, 9])
             elif node.name == "aten_view_copy_default_2":
                 # aten_view_copy_default_2 is a memory.view of aten_mul_tensor
 
@@ -184,9 +193,10 @@ def test_spec(self) -> None:
         plan = etpm.executorch_program.execution_plan[0]
         self.assertEqual(plan.operators[0].name, "executorch_prim::et_view")
         self.assertEqual(plan.operators[1].name, "aten::mul")
+        self.assertEqual(plan.operators[2].name, "aten::view_copy")
 
         instructions = plan.chains[0].instructions
-        self.assertEqual(len(instructions), 4)
+        self.assertEqual(len(instructions), 6)
 
         self.assertEqual(
             instructions[0].instr_args.op_index, 0  # pyre-ignore
@@ -196,7 +206,13 @@ def test_spec(self) -> None:
         )  # view @ idx3
         self.assertEqual(
             instructions[2].instr_args.op_index, 1  # pyre-ignore
-        )  # aten:mul @ idx5
+        )  # aten:mul @ idx6
         self.assertEqual(
             instructions[3].instr_args.op_index, 0  # pyre-ignore
-        )  # view @ idx6
+        )  # view @ idx7
+        self.assertEqual(
+            instructions[4].instr_args.op_index, 1  # pyre-ignore
+        )  # aten:mul @ idx9
+        self.assertEqual(
+            instructions[5].instr_args.op_index, 2  # pyre-ignore
+        )  # aten:view_copy @ idx11

From 749f6ab419592cfdc2f2e9938d11c9a4b0b120e5 Mon Sep 17 00:00:00 2001
From: Carlos Fernandez <carlosfsanz@meta.com>
Date: Thu, 9 May 2024 15:27:29 -0700
Subject: [PATCH 27/62] Rust: Update sysinfo crate (#3520)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3520

I need an updated sysinfo for some Windows stuff; the previous version doesn't seem to work correctly (plus the documented API has changed a bit and the examples just don't build).

Reviewed By: JakobDegen

Differential Revision: D56913751

fbshipit-source-id: 30ba14269792ad46236929ba40071974dd1ce436
---
 shim/third-party/rust/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index b2de7475bae..88ba5dda90f 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -184,7 +184,7 @@ syn1 = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold
 synstructure = "0.12"
 sync_wrapper = "0.1.0"
 sys-info = "0.9.1"
-sysinfo = "0.26.8"
+sysinfo = "0.30.11"
 take_mut = "0.2.2"
 tar = "0.4.38"
 tempfile = "3.1.0"

From c83af25b76254e96ad0e67ddb0593ad8d039dc66 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 10 May 2024 10:12:06 -0700
Subject: [PATCH 28/62] Add target_link_options_shared_lib to coremldelegate
 build (#3556)

Summary:
Backends use a static initializer to register themselves. We have an established solution for forcing the Apple linker to load the object files containing said initializer, so let's use it for CoreML.

Pull Request resolved: https://github.com/pytorch/executorch/pull/3556

Test Plan: Attempting to load a CoreML PTE from Python no longer fails with an error about the backend not being registered

Reviewed By: mikekgfb

Differential Revision: D57136490

Pulled By: swolchok

fbshipit-source-id: 613d7f786fa47f34a94ee4eea7b2a81ef670a573
---
 backends/apple/coreml/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt
index 0f56e3bdf7e..c9a15ed935e 100644
--- a/backends/apple/coreml/CMakeLists.txt
+++ b/backends/apple/coreml/CMakeLists.txt
@@ -155,6 +155,8 @@ target_link_libraries(
           ${FOUNDATION_FRAMEWORK} ${SQLITE_LIBRARY}
 )
 
+target_link_options_shared_lib(coremldelegate)
+
 if(COREML_BUILD_EXECUTOR_RUNNER)
   target_link_libraries(
     coremldelegate PRIVATE portable_ops_lib portable_kernels

From 8e2e2e21470349e9a1b1c5e058370ae53ac48ff3 Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@meta.com>
Date: Fri, 10 May 2024 11:14:20 -0700
Subject: [PATCH 29/62] explicitly import _export.exported_program. (#3574)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3574

We will still merge the two serializations, but for now just fix the missing imports.

Reviewed By: tugsbayasgalan

Differential Revision: D57219425

fbshipit-source-id: 21ca361c5041872e90fa779b6b75027c7e3585b8
---
 exir/serde/export_serialize.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py
index 87691dfbee2..13590b78dde 100644
--- a/exir/serde/export_serialize.py
+++ b/exir/serde/export_serialize.py
@@ -22,6 +22,7 @@
 import sympy
 
 import torch
+import torch._export.exported_program
 import torch.export.exported_program as ep
 
 from torch._export.serde.schema import (

From 1871ec1d06dbabdc8c7a86d7d4d76983c98c4cc8 Mon Sep 17 00:00:00 2001
From: lucylq <lfq@meta.com>
Date: Fri, 10 May 2024 12:07:48 -0700
Subject: [PATCH 30/62] min pip version (#3526)

Summary:
see [T188128067](https://www.internalfb.com/intern/tasks/?t=188128067)
and https://github.com/pytorch/torchchat/issues/726

Pull Request resolved: https://github.com/pytorch/executorch/pull/3526

Reviewed By: huydhn

Differential Revision: D57032452

Pulled By: lucylq

fbshipit-source-id: 9770d5fea83b551518e8b14579f4e40baef85195
---
 build/packaging/pre_build_script.sh | 1 +
 install_requirements.sh             | 1 +
 pyproject.toml                      | 1 +
 3 files changed, 3 insertions(+)

diff --git a/build/packaging/pre_build_script.sh b/build/packaging/pre_build_script.sh
index 1abb1a76fe3..74c98406d05 100644
--- a/build/packaging/pre_build_script.sh
+++ b/build/packaging/pre_build_script.sh
@@ -16,6 +16,7 @@ set -euxo pipefail
 readonly BUILD_DEPS=(
   # This list must match the build-system.requires list from pyproject.toml.
   "cmake"
+  "pip>=23"
   "pyyaml"
   "setuptools>=63"
   "tomli"
diff --git a/install_requirements.sh b/install_requirements.sh
index d88eb505a6c..24a01cae9b6 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -73,6 +73,7 @@ EXIR_REQUIREMENTS=(
 # pip packages needed for development.
 DEVEL_REQUIREMENTS=(
   cmake  # For building binary targets.
+  "pip>=23" # For building the pip package.
   pyyaml  # Imported by the kernel codegen tools.
   "setuptools>=63"  # For building the pip package.
   tomli  # Imported by extract_sources.py when using python < 3.11.
diff --git a/pyproject.toml b/pyproject.toml
index 099cdd0d32c..b6926a2f5f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,7 @@
 [build-system]
 requires = [
   "cmake",  # For building binary targets in the wheel.
+  "pip>=23",  # For building the pip package.
   "pyyaml",  # Imported by the kernel codegen tools.
   "setuptools>=63",  # For building the pip package contents.
   "tomli",  # Imported by extract_sources.py when using python < 3.11.

From b93b7ae4ad00ac15ab5ade347fa0d2ce5756e32e Mon Sep 17 00:00:00 2001
From: Riley Dulin <dulinr@meta.com>
Date: Fri, 10 May 2024 13:05:32 -0700
Subject: [PATCH 31/62] Change custom_skip_targets meaning for
 constant_prop_pass (#3491)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3491

Some users of `constant_prop_pass` want to fold across calls to
`full`, because representing a tensor as a program constant is a requirement
for some backends.
This came up when writing some tests using `torch.ones` as a weight tensor,
which is represented as `aten.full` in Edge Dialect.

When the user specifies a custom skip set, do *not* add the default `aten.full`
function, in case the user doesn't want it.

Reviewed By: angelayi

Differential Revision: D56894215

fbshipit-source-id: 24b7f570ce41576650c457586fc5540371889121
---
 exir/passes/constant_prop_pass.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py
index 0fabf223fb8..96c40e65363 100644
--- a/exir/passes/constant_prop_pass.py
+++ b/exir/passes/constant_prop_pass.py
@@ -112,11 +112,11 @@ def get_propagated_const_tensor_dict(
     # Initialize dict with all constant placeholders.
     const_node_to_tensor = get_constant_placeholder_dict(exported_program)
 
-    all_skip_targets: set[EdgeOpOverload] = set()
-    # Default set of targets to skip.
-    all_skip_targets.update(_DEFAULT_SKIP_TARGETS)
     if custom_skip_targets is not None:
-        all_skip_targets.update(custom_skip_targets)
+        all_skip_targets = custom_skip_targets
+    else:
+        # Default set of targets to skip.
+        all_skip_targets = _DEFAULT_SKIP_TARGETS
 
     for node in exported_program.graph.nodes:
         if node.op != "call_function" or node.target in all_skip_targets:

From 60fa0d3a78246185b0b08974e4b6b50e56d6845d Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@meta.com>
Date: Fri, 10 May 2024 14:20:09 -0700
Subject: [PATCH 32/62] build and upload aar (#3381)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/3381

Reviewed By: huydhn

Differential Revision: D56653825

Pulled By: kirklandsign

fbshipit-source-id: 06c903754a1628f9eaab72c7d21d2e8b6601eadc
---
 .ci/docker/common/install_base.sh |  3 ++-
 .github/workflows/android.yml     |  3 +++
 build/test_android_ci.sh          | 18 ++++++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
index fec0e057ba1..cbca22cfa33 100755
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@@ -23,7 +23,8 @@ install_ubuntu() {
     unzip \
     gdb \
     rsync \
-    libssl-dev
+    libssl-dev \
+    zip
 
   # Cleanup package manager
   apt-get autoclean && apt-get clean
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 324440dd745..051c9b22a78 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -61,6 +61,9 @@ jobs:
         cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/
         cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/
         cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/
+        # Copy AAR to S3
+        cp build_aar/executorch.aar artifacts-to-be-uploaded/
+        cp build_aar/executorch-llama.aar artifacts-to-be-uploaded/
 
   # Upload the app and its test suite to S3 so that they can be downloaded by the test job
   upload-artifacts:
diff --git a/build/test_android_ci.sh b/build/test_android_ci.sh
index 8d9391146dc..b1f17730f5d 100755
--- a/build/test_android_ci.sh
+++ b/build/test_android_ci.sh
@@ -22,6 +22,7 @@ build_android_native_library() {
   pushd examples/demo-apps/android/LlamaDemo
   CMAKE_OUT="cmake-out-android-$1" ANDROID_NDK=/opt/ndk ANDROID_ABI="$1" ./gradlew setup
   popd
+  cp "cmake-out-android-$1"/extension/android/*.so build_aar/jni/$1/
 }
 
 build_android_demo_app() {
@@ -37,8 +38,25 @@ build_android_llama_demo_app() {
   popd
 }
 
+build_aar() {
+  cp extension/android/build/libs/executorch.jar build_aar/libs
+  echo \<manifest xmlns:android=\"http://schemas.android.com/apk/res/android\" \
+   package=\"org.pytorch.executorch\"\> \
+   \<uses-sdk android:minSdkVersion=\"19\" /\> \
+   \</manifest\> > build_aar/AndroidManifest.xml
+  pushd build_aar
+  zip -r executorch.aar libs jni AndroidManifest.xml
+
+  rm jni/arm64-v8a/libexecutorch_jni.so jni/x86_64/libexecutorch_jni.so
+  zip -r executorch-llama.aar libs jni AndroidManifest.xml
+  popd
+}
+
+mkdir -p build_aar/jni/arm64-v8a build_aar/jni/x86_64 build_aar/libs
+
 build_android_native_library arm64-v8a
 build_android_native_library x86_64
 export_model
 build_android_demo_app
 build_android_llama_demo_app
+build_aar

From 43bfcd2b390300f428347a539a7a5938a1345e34 Mon Sep 17 00:00:00 2001
From: Justin Yip <yipjustin@meta.com>
Date: Fri, 10 May 2024 15:26:47 -0700
Subject: [PATCH 33/62] Batch norm (#3569)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3569

as title.
ghstack-source-id: 225969399

Reviewed By: copyrightly

Differential Revision: D57198172

fbshipit-source-id: 853944b2dace7294235e89b6959f3a2d72f6a322
---
 .../vulkan/runtime/graph/ComputeGraph.cpp     |   7 +-
 .../runtime/graph/ops/glsl/batchnorm.glsl     |  55 +++++++++
 .../runtime/graph/ops/glsl/batchnorm.yaml     |  10 ++
 .../runtime/graph/ops/impl/BatchNorm.cpp      | 110 ++++++++++++++++++
 backends/vulkan/test/op_tests/cases.py        |  77 ++++++++++++
 .../vulkan/test/op_tests/utils/codegen.py     |   9 +-
 6 files changed, 266 insertions(+), 2 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 2e8f4c007d0..cc91e6b2dc6 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -280,7 +280,12 @@ ValueRef ComputeGraph::set_output_tensor(
     api::ScalarType dtype = get_tensor(idx)->dtype();
     size_t gpu_numel = get_tensor(idx)->gpu_numel();
     ValueRef staging_idx = add_staging(dtype, gpu_numel);
-    add_tensor_to_staging_node(*this, idx, staging_idx);
+    // We only run this when the tensor is non-empty.  When the underlying
+    // tensor is empty (e.g. gpu_numel == 0), no VkImage is allocated for the
+    // tensor, so we would not be able to bind the node for execution.
+    if (gpu_numel > 0) {
+      add_tensor_to_staging_node(*this, idx, staging_idx);
+    }
     outputs_.push_back({idx, staging_idx});
     return staging_idx;
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl
new file mode 100644
index 00000000000..deb03192af0
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+layout(set = 0, binding = 2) uniform PRECISION sampler3D weight_in;
+layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in;
+layout(set = 0, binding = 4) uniform PRECISION sampler3D mean_in;
+layout(set = 0, binding = 5) uniform PRECISION sampler3D var_in;
+
+layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict Params {
+  float eps;
+};
+
+layout(set = 0, binding = 8) uniform PRECISION restrict Params2 {
+  int num_texel_per_batch;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  ivec3 pos = ivec3(gl_GlobalInvocationID);
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  VEC4_T v = VEC4_T(texelFetch(image_in, pos, 0));
+
+  ivec3 param_pos = ivec3(pos.z % num_texel_per_batch, 0, 0);
+
+  VEC4_T weight = VEC4_T(texelFetch(weight_in, param_pos, 0));
+  VEC4_T bias = VEC4_T(texelFetch(bias_in, param_pos, 0));
+  VEC4_T mean = VEC4_T(texelFetch(mean_in, param_pos, 0));
+  VEC4_T var = VEC4_T(texelFetch(var_in, param_pos, 0));
+
+  v = ((v - mean) / sqrt(var + eps)) * weight + bias;
+
+  imageStore(image_out, pos, v);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml
new file mode 100644
index 00000000000..a92e44f636b
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml
@@ -0,0 +1,10 @@
+batchnorm:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: batchnorm
diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
new file mode 100644
index 00000000000..7ea541aab46
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+ValueRef prepack_arg(
+    ComputeGraph& graph,
+    ValueRef arg_ref,
+    int64_t num_channels,
+    const std::string& debug_name) {
+  VK_CHECK_COND(
+      graph.val_is_tref(arg_ref),
+      "native_batch_norm requires ",
+      debug_name,
+      " to be a constant tensorref");
+  VK_CHECK_COND(graph.get_tref(arg_ref)->sizes[0] == num_channels);
+
+  // batch_norm's params are broadcast along the channel dimension.
+  // In this implementation, we pack the params along the x dimension, and
+  // in the shader, we look them up along the x axis.
+  return prepack_if_tensor_ref(graph, arg_ref, api::kWidthPacked);
+}
+
+void add_native_batch_norm_node(
+    ComputeGraph& graph,
+    ValueRef in_ref,
+    ValueRef weight_ref,
+    ValueRef bias_ref,
+    ValueRef mean_ref,
+    ValueRef var_ref,
+    ValueRef eps_ref,
+    ValueRef out_tuple_ref) {
+  std::vector<int64_t> in_sizes = graph.get_tensor(in_ref)->sizes();
+  std::vector<int64_t> out_sizes = graph.get_tensor(in_ref)->sizes();
+
+  VK_CHECK_COND(in_sizes.size() == 4, "BatchNorm only support 4d tensor");
+  VK_CHECK_COND(out_sizes.size() == 4, "BatchNorm only support 4d tensor");
+
+  int64_t num_channels = dim_at<kChannel4D>(in_sizes);
+
+  ValueRef arg_weight = prepack_arg(graph, weight_ref, num_channels, "weight");
+  ValueRef arg_bias = prepack_arg(graph, bias_ref, num_channels, "bias");
+  ValueRef arg_mean = prepack_arg(graph, mean_ref, num_channels, "mean");
+  ValueRef arg_var = prepack_arg(graph, var_ref, num_channels, "var");
+  float epsilon = graph.extract_scalar<float>(eps_ref);
+
+  vTensorPtr t_in = graph.get_tensor(in_ref);
+
+  // Only the first element of the return value is propagated. The remaining 2
+  // elements are zero-size dummy tensors.
+  const auto out_tuple_val = graph.get_value_list(out_tuple_ref);
+
+  ValueRef out_ref = out_tuple_val->at(0);
+
+  VK_CHECK_COND(!graph.val_is_tref(out_ref), "Output should not be tref");
+  vTensorPtr t_out = graph.get_tensor(out_ref);
+
+  VK_CHECK_COND(
+      dim_at<kChannel4D>(t_out->sizes()) == num_channels,
+      "out channel must match in channel");
+
+  std::string kernel_name = "batchnorm";
+  add_dtype_suffix(kernel_name, *t_out);
+
+  api::utils::uvec3 global_size = t_out->extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  int32_t num_texel_per_batch =
+      api::utils::div_up((dim_at<kChannel4D>(t_in->sizes())), 4);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      {{out_ref, api::MemoryAccessType::WRITE},
+       {{in_ref, arg_weight, arg_bias, arg_mean, arg_var},
+        api::MemoryAccessType::READ}},
+      {t_out->texture_limits_ubo(),
+       graph.create_params_buffer(epsilon),
+       graph.create_params_buffer(num_texel_per_batch)}));
+}
+
+void native_batch_norm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[5] is momentum. It is not used in the calculation.
+  return add_native_batch_norm_node(
+      graph, args[0], args[1], args[2], args[3], args[4], args[6], args[7]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(
+      aten._native_batch_norm_legit_no_training.default, native_batch_norm);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 8e7fbab6636..9f47284485b 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -651,6 +651,82 @@ def get_unary_ops_inputs():
     return test_suite
 
 
+def get_native_batch_norm_inputs():
+    Test = namedtuple(
+        "VkSliceTest", ["self", "weight", "bias", "mean", "var", "momentum", "eps"]
+    )
+
+    test_cases = [
+        Test(
+            self=(1, 1, 2, 5),
+            weight=(1,),
+            bias=(1,),
+            mean=(1,),
+            var=(1,),
+            momentum=0.0,
+            eps=0.001,
+        ),
+        Test(
+            self=(S2, 1, 2, 5),
+            weight=(1,),
+            bias=(1,),
+            mean=(1,),
+            var=(1,),
+            momentum=0.0,
+            eps=0.001,
+        ),
+        Test(
+            self=(1, S2, 2, 5),
+            weight=(S2,),
+            bias=(S2,),
+            mean=(S2,),
+            var=(S2,),
+            momentum=0.0,
+            eps=0.001,
+        ),
+        Test(
+            self=(9, S1, 2, 5),
+            weight=(S1,),
+            bias=(S1,),
+            mean=(S1,),
+            var=(S1,),
+            momentum=0.0,
+            eps=0.01,
+        ),
+        Test(
+            self=(3, S1, 2, 5),
+            weight=(S1,),
+            bias=(S1,),
+            mean=(S1,),
+            var=(S1,),
+            momentum=0.0,
+            eps=0.001,
+        ),
+        Test(
+            self=(3, S2, 2, 5),
+            weight=(S2,),
+            bias=(S2,),
+            mean=(S2,),
+            var=(S2,),
+            momentum=0.0,
+            eps=0.001,
+        ),
+        Test(
+            self=(3, S2, 2, 5),
+            weight=(S2,),
+            bias=(S2,),
+            mean=(S2,),
+            var=(S2,),
+            momentum=0.0,
+            eps=0.000,
+        ),
+    ]
+
+    test_suite = VkTestSuite(test_cases)
+
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -678,4 +754,5 @@ def get_unary_ops_inputs():
     "aten.sqrt.default": get_unary_ops_inputs(),
     "aten._softmax.default": get_softmax_inputs(),
     "aten._log_softmax.default": get_softmax_inputs(),
+    "aten._native_batch_norm_legit_no_training.default": get_native_batch_norm_inputs(),
 }
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
index 7cfe71eee7f..fa02986a1cd 100644
--- a/backends/vulkan/test/op_tests/utils/codegen.py
+++ b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -130,7 +130,14 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite):
                 ATenArg(name=arg.name, cpp_type=cpp_type, default=arg.default)
             )
 
-            requires_prepack = "weight" in arg.name or "bias" in arg.name
+            # These are the arguments that will be passed as "weight" tensors; the
+            # corresponding objects will be TensorRefs in the compute graph.
+            requires_prepack = (
+                "weight" in arg.name
+                or "bias" in arg.name
+                or "running_mean" in arg.name
+                or "running_var" in arg.name
+            )
             supports_prepack = False
             if arg.name in self.suite_def.prepacked_args:
                 supports_prepack = True

From 60c94e81bc0ea31f63977da57a47e2791b909e9d Mon Sep 17 00:00:00 2001
From: Wei Lu <luwei@meta.com>
Date: Fri, 10 May 2024 18:58:03 -0700
Subject: [PATCH 34/62] gelu (#3573)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3573

- implemented `gelu` op, only `approximate="tanh"` is supported for now
- added string data type in the codegen
- added `extract_string` in `ComputeGraph.h`

Reviewed By: yipjustin, jorgep31415

Differential Revision: D57194536

fbshipit-source-id: 4c6c2e126fe35021248759ad4578d8f6aec9bffc
---
 backends/vulkan/partitioner/vulkan_partitioner.py   |  3 ++-
 backends/vulkan/runtime/graph/ComputeGraph.h        |  4 ++++
 .../vulkan/runtime/graph/ops/glsl/unary_op.yaml     |  6 ++++--
 backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp  | 13 +++++++++++--
 backends/vulkan/test/op_tests/cases.py              | 13 +++++++++++++
 backends/vulkan/test/op_tests/utils/codegen.py      |  3 +++
 backends/vulkan/test/op_tests/utils/codegen_base.py |  3 +++
 backends/vulkan/test/test_vulkan_delegate.py        | 11 +++++++++++
 8 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index 93e703dc030..86dfb74d069 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -38,11 +38,12 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             # Unary operators
             exir_ops.edge.aten.abs.default,
             exir_ops.edge.aten.clamp.default,
+            exir_ops.edge.aten.gelu.default,
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.relu.default,
             exir_ops.edge.aten.sigmoid.default,
-            exir_ops.edge.aten.tanh.default,
             exir_ops.edge.aten.sqrt.default,
+            exir_ops.edge.aten.tanh.default,
             # Matrix multiplication operators
             exir_ops.edge.aten.bmm.default,
             exir_ops.edge.aten.mm.default,
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 00d8cbd3c55..fbb49f47998 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -196,6 +196,10 @@ class ComputeGraph final {
     }
   }
 
+  std::string extract_string(const ValueRef idx) {
+    return values_.at(idx).toString();
+  }
+
   inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
     return prepack_nodes_;
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
index 2d8ec36d9a8..14e4e111a2d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml
@@ -12,9 +12,11 @@ unary_op:
       OPERATOR: abs(X)
     - NAME: clamp
       OPERATOR: clamp(X, A, B)
+    - NAME: gelu
+      OPERATOR: 0.5 * X * (1 + tanh(sqrt(2 / 3.141593) * (X + 0.044715 * X * X * X)))
     - NAME: sigmoid
       OPERATOR: 1 / (1 + exp(-1 * X))
-    - NAME: tanh
-      OPERATOR: tanh(clamp(X, -15.0, 15.0))
     - NAME: sqrt
       OPERATOR: sqrt(X)
+    - NAME: tanh
+      OPERATOR: tanh(clamp(X, -15.0, 15.0))
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index b2fb1135d77..4dd615cda18 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -100,10 +100,18 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) {
         kClampShaderName);                                               \
   }
 
+void gelu(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[1] is the `approximate` string
+  // https://fburl.com/code/9omngmyo
+  // currently only `approximate = "tanh"` is supported
+  return add_unary_op_node(
+      graph, args[0], kDummyFloat, kDummyFloat, args[2], "gelu");
+}
+
 DEFINE_ACTIVATION_FN(abs);
 DEFINE_ACTIVATION_FN(sigmoid);
-DEFINE_ACTIVATION_FN(tanh);
 DEFINE_ACTIVATION_FN(sqrt);
+DEFINE_ACTIVATION_FN(tanh);
 DEFINE_CLAMP_FN(clamp);
 DEFINE_CLAMP_FN(hardtanh);
 DEFINE_RELU_FN(relu);
@@ -111,11 +119,12 @@ DEFINE_RELU_FN(relu);
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.abs.default, abs);
   VK_REGISTER_OP(aten.clamp.default, clamp);
+  VK_REGISTER_OP(aten.gelu.default, gelu);
   VK_REGISTER_OP(aten.hardtanh.default, hardtanh);
   VK_REGISTER_OP(aten.relu.default, relu);
   VK_REGISTER_OP(aten.sigmoid.default, sigmoid);
-  VK_REGISTER_OP(aten.tanh.default, tanh);
   VK_REGISTER_OP(aten.sqrt.default, sqrt);
+  VK_REGISTER_OP(aten.tanh.default, tanh);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 9f47284485b..a1e6227a227 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -727,6 +727,18 @@ def get_native_batch_norm_inputs():
     return test_suite
 
 
+def get_gelu_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((M1), "tanh"),
+            ((M1, M2), "tanh"),
+            ((S1, M1, M2), "tanh"),
+            ((S1, S2, S2, M2), "tanh"),
+        ]
+    )
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -755,4 +767,5 @@ def get_native_batch_norm_inputs():
     "aten._softmax.default": get_softmax_inputs(),
     "aten._log_softmax.default": get_softmax_inputs(),
     "aten._native_batch_norm_legit_no_training.default": get_native_batch_norm_inputs(),
+    "aten.gelu.default": get_gelu_inputs(),
 }
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
index fa02986a1cd..c803f767920 100644
--- a/backends/vulkan/test/op_tests/utils/codegen.py
+++ b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -24,6 +24,7 @@
     OPT_LAYOUT,
     OPT_MEMORY_FORMAT,
     OPT_SCALAR_TYPE,
+    STRING,
     TENSOR_VECTOR,
     TestSuite,
     TestSuiteGen,
@@ -351,6 +352,8 @@ def create_value_for(self, ref: ValueRefList) -> str:  # noqa: C901
             or ref.src_cpp_type == OPT_MEMORY_FORMAT
         ):
             ret_str += "add_none(); \n"
+        elif ref.src_cpp_type == STRING:
+            ret_str += f"add_string(std::string({ref.src_cpp_name})); \n"
         elif ref.src_cpp_type == TWO_TENSOR_TUPLE:
             ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second}}); \n"
         elif ref.src_cpp_type == THREE_TENSOR_TUPLE:
diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py
index 6dac97583c6..c1c6249e27f 100644
--- a/backends/vulkan/test/op_tests/utils/codegen_base.py
+++ b/backends/vulkan/test/op_tests/utils/codegen_base.py
@@ -29,6 +29,7 @@
 OPT_LAYOUT = "::std::optional<at::Layout>"
 OPT_MEMORY_FORMAT = "::std::optional<at::MemoryFormat>"
 OPT_SCALAR_TYPE = "::std::optional<at::ScalarType>"
+STRING = "c10::string_view"
 TWO_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor>"
 THREE_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor,at::Tensor>"
 TENSOR_VECTOR = "::std::vector<at::Tensor>"
@@ -166,6 +167,8 @@ def create_input_data(self, arg: Argument, data: Any) -> str:  # noqa: C901
                 ret_str += "std::nullopt;"
             else:
                 ret_str += f"{str(data)};"
+        elif cpp_type == STRING:
+            ret_str += f'c10::string_view("{data}");'
         elif (
             cpp_type == OPT_SCALAR_TYPE
             or cpp_type == OPT_LAYOUT
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 531f1d28a92..2cd3bc3a270 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1034,3 +1034,14 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_gelu(self):
+        class GeluModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.gelu = torch.nn.GELU(approximate="tanh")
+
+            def forward(self, x):
+                return self.gelu(x)
+
+        self.lower_unary_module_and_test_output(GeluModule())

From 629b1127f3f501485803f1bb29988b01834a1579 Mon Sep 17 00:00:00 2001
From: Matthias Cremon <matthiascremon@meta.com>
Date: Mon, 13 May 2024 07:02:56 -0700
Subject: [PATCH 35/62] Cadence - refactor quantizer and passes (#3539)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3539

Update the quantizer to the latest internal version, and heavily refactor it for readability and maintenance.

Reviewed By: cccclai

Differential Revision: D57082021

fbshipit-source-id: 947753859f6ffc5ea9ea0e803d2c4a0fd30bf608
---
 backends/cadence/aot/__init__.py              |   5 +
 backends/cadence/aot/export_example.py        |   8 +-
 backends/cadence/aot/passes.py                |  42 +
 backends/cadence/aot/quantizer.py             | 855 ------------------
 backends/cadence/aot/quantizer/fusion_pass.py | 437 +++++++++
 backends/cadence/aot/quantizer/patterns.py    | 344 +++++++
 backends/cadence/aot/quantizer/quantizer.py   | 145 +++
 backends/cadence/aot/quantizer/utils.py       | 129 +++
 8 files changed, 1106 insertions(+), 859 deletions(-)
 create mode 100644 backends/cadence/aot/passes.py
 delete mode 100644 backends/cadence/aot/quantizer.py
 create mode 100644 backends/cadence/aot/quantizer/fusion_pass.py
 create mode 100644 backends/cadence/aot/quantizer/patterns.py
 create mode 100644 backends/cadence/aot/quantizer/quantizer.py
 create mode 100644 backends/cadence/aot/quantizer/utils.py

diff --git a/backends/cadence/aot/__init__.py b/backends/cadence/aot/__init__.py
index e69de29bb2d..2e41cd717f6 100644
--- a/backends/cadence/aot/__init__.py
+++ b/backends/cadence/aot/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py
index e95a39b5a36..bf96de2afdf 100644
--- a/backends/cadence/aot/export_example.py
+++ b/backends/cadence/aot/export_example.py
@@ -14,12 +14,12 @@
 from typing import Any, Tuple
 
 from executorch.backends.cadence.aot.compiler import export_to_edge
-from executorch.backends.cadence.aot.quantizer import (
-    CadenceBaseQuantizer,
-    QuantFusion,
+from executorch.backends.cadence.aot.passes import (
     ReplacePT2DequantWithCadenceDequant,
     ReplacePT2QuantWithCadenceQuant,
 )
+from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
+from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
 from executorch.exir import ExecutorchProgramManager
 from torch import nn
 from torch._export import capture_pre_autograd_graph
@@ -52,7 +52,7 @@ def export_model(
     model: nn.Module, example_inputs: Tuple[Any], file_name: str = "CadenceDemoModel"
 ):
     # Quantizer
-    quantizer = CadenceBaseQuantizer()
+    quantizer = CadenceQuantizer()
 
     # Export
     model_exp = capture_pre_autograd_graph(model, example_inputs)
diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py
new file mode 100644
index 00000000000..2ced2eaf87a
--- /dev/null
+++ b/backends/cadence/aot/passes.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class ReplacePT2QuantWithCadenceQuant(ExportPass):
+    """
+    Replace the pt2 quantization ops with custom cadence quantization ops.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}:
+            return super().call_operator(op, args, kwargs, meta)
+
+        return super().call_operator(
+            exir_ops.edge.cadence.quantize_per_tensor.default,
+            args,
+            kwargs,
+            meta,
+        )
+
+
+class ReplacePT2DequantWithCadenceDequant(ExportPass):
+    """
+    Replace the pt2 dequantization ops with custom cadence dequantization ops.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}:
+            return super().call_operator(op, args, kwargs, meta)
+
+        return super().call_operator(
+            exir_ops.edge.cadence.dequantize_per_tensor.default,
+            args,
+            kwargs,
+            meta,
+        )
diff --git a/backends/cadence/aot/quantizer.py b/backends/cadence/aot/quantizer.py
deleted file mode 100644
index df184f9d92c..00000000000
--- a/backends/cadence/aot/quantizer.py
+++ /dev/null
@@ -1,855 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
-from math import frexp, isclose, trunc
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
-
-import torch
-from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
-
-from torch import fx
-
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
-from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
-from torch.ao.quantization.quantizer import Quantizer
-from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
-from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
-    OperatorConfig,
-    QuantizationAnnotation,
-    QuantizationConfig,
-    QuantizationSpec,
-    SharedQuantizationSpec,
-)
-from torch.fx import GraphModule
-from torch.fx.passes.infra.pass_base import PassResult
-from torch.fx.passes.utils.fuser_utils import legalize_graph
-
-
-def quantize_tensor_multiplier(
-    requantize_scale_tensor: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Given requantize_scale_tensor with values in the interval (0, 1),
-    produce a pair of tensors (out_multiplier, right_shift) where out_multiplier
-    is an int32 tensor representing fixed-point values in the interval [-1, 1),
-    and right_shift is an amount to shift right by, so that the floating-point
-    multiplication of some int32 input with each value of requantize_scale_tensor:
-        result = int32_value * requantize_scale_tensors[i]
-    is best approximated by the integer-arithmetic-only code:
-        result = RoundingRightShift(FixedPointMultiplication(int32_value,
-                                    out_multiplier[i]), right_shift[i])
-    """
-
-    # This is identical to C++11 std::round(). The general python round rounds
-    # down, and C++ rounds away from zero.
-    def round_away_zero(f) -> int:
-        r = -0.5 if (f < 0) else 0.5
-        return trunc(f + r)
-
-    def quantize_scalar_multiplier(requantize_scale: float) -> Tuple[int, int]:
-        significand, exponent = frexp(requantize_scale)
-        significand_q31 = int(round_away_zero(significand * (1 << 31)))
-        # Handle the special case when the real multiplier was so close to 1
-        # that its fixed-point approximation was indistinguishable from 1.
-        # We handle this by dividing it by two, incrementing exponent by 1.
-        # the right shift amount.
-        if significand_q31 == (1 << 31):
-            significand_q31 //= 2
-            exponent += 1
-
-        # Verify that the decomposition of requantize_scale into significand
-        # and exponent is correct.
-        reconstructed = significand_q31 / (1 << 31) * pow(2, exponent)
-        assert isclose(
-            requantize_scale, reconstructed, rel_tol=1e-4, abs_tol=1e-4
-        ), "computation of significand and exponent from requantize_scale is not accurate"
-
-        return (significand_q31, exponent)
-
-    # Flatten the input scale tensor so that we can operate on individual values
-    orig_shape = requantize_scale_tensor.shape
-    flattened_tensor = requantize_scale_tensor.flatten().to(torch.float32)
-    out_multiplier = torch.zeros(flattened_tensor.shape, dtype=torch.int32)
-    right_shift = torch.zeros(flattened_tensor.shape, dtype=torch.int32)
-
-    # Iterate over the flattened scale tensor and compute the decomposition of
-    # each value in scale tensor into significand(out_multiplier) and
-    # exponent(right_shift)
-    for idx, scale in enumerate(flattened_tensor):
-        (si, ex) = quantize_scalar_multiplier(scale)
-        out_multiplier[idx], right_shift[idx] = si, ex
-
-    # Reshape the tensors back to the original shape
-    out_multiplier = out_multiplier.reshape(orig_shape)
-    right_shift = right_shift.reshape(orig_shape)
-
-    return (out_multiplier, right_shift)
-
-
-def _is_annotated(nodes: List[fx.Node]) -> bool:
-    annotated = False
-    for node in nodes:
-        annotated = annotated or (
-            "quantization_annotation" in node.meta
-            and node.meta["quantization_annotation"]._annotated
-        )
-    return annotated
-
-
-def _no_outside_users(fused_partition) -> bool:
-    """
-    Checks if each partition other than the last does not have any outside users.
-    """
-    for source_partition in fused_partition[:-1]:
-        if len(source_partition.output_nodes) != 1:
-            return False
-        if len(source_partition.output_nodes[0].users) != 1:
-            return False
-    return True
-
-
-# Helper function to get the weight node for both quantized and unquantized weights
-# TODO(matthiascremon): get a better test!
-def get_weight_node(weights_inputs: fx.Node, dequants_weights: fx.Node) -> fx.Node:
-    """
-    Returns the weight node.
-    """
-    weight_node = (
-        weights_inputs
-        if weights_inputs.name.endswith("_frozen_param")
-        else dequants_weights
-    )
-    return weight_node
-
-
-# Helper function to get the args and kwargs for the linear replacement op
-def get_args_and_kwargs_linear(
-    graph_module: GraphModule,
-    inputs_inputs: List[fx.Node],
-    dequants_inputs: List[fx.Node],
-    other_inputs: List[fx.Node],
-    weights_inputs: List[fx.Node],
-    dequants_weights: List[fx.Node],
-    bias_inputs: List[fx.Node],
-    quant_node: fx.Node,
-) -> Tuple[Tuple[Any], Dict[str, Any]]:
-    """
-    Returns the args and kwargs for the linear replacement op.
-    """
-    weight_scale = get_weight_node(weights_inputs[0], dequants_weights[0]).args[1]
-    # pyre-fixme[58]: Unsupported operand types
-    bias_scale = dequants_inputs[0].args[1] * weight_scale
-    requantize_scale = bias_scale / quant_node.args[1]
-    requantize_scale_t = torch.tensor([requantize_scale])
-
-    (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)
-
-    # If bias is not available, create a bias tensor with the shape of weight[0]
-    if not bias_inputs:
-        weight_node = get_weight_node(weights_inputs[0], dequants_weights[0]).args[0]
-        # pyre-fixme[16]: Undefined attribute
-        attr_node = getattr(graph_module, weight_node.target)
-        weight_shape = list(attr_node.shape)
-        bias_shape = weight_shape[0]
-        bias = graph_module.graph.call_function(
-            torch.ops.aten.full.default, ([bias_shape], 0.0)
-        )
-    else:
-        bias = bias_inputs[0]
-
-    bias_int32_quant = graph_module.graph.call_function(
-        torch.ops.quantized_decomposed.quantize_per_tensor.default,
-        (
-            bias,
-            bias_scale,
-            0,
-            -(2**31),
-            (2**31) - 1,
-            torch.int32,
-        ),
-    )
-
-    # Create single element tensors for weight_zero_point, out_multiplier, out_shift.
-    # Note that the function expects int32_t, when it would default to int64_t, so
-    # we explicitly require that type.
-    weight_zero_point_ = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], dequants_weights[0].args[2]),
-        {"dtype": torch.int32},
-    )
-    out_multiplier_ = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], out_multiplier[0].item()),
-        {"dtype": torch.int32},
-    )
-    out_shift_ = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], out_shift[0].item()),
-        {"dtype": torch.int32},
-    )
-
-    args = tuple(inputs_inputs + weights_inputs + other_inputs + [bias_int32_quant])
-    kwargs = {
-        "src_zero_point": dequants_inputs[0].args[2],
-        "weight_zero_point": weight_zero_point_,
-        "out_multiplier": out_multiplier_,
-        "out_shift": out_shift_,
-        "out_zero_point": quant_node.args[2],
-        "offset": None,
-    }
-    return args, kwargs
-
-
-# Helper function to get the args and kwargs for the layer norm replacement op
-def get_args_and_kwargs_layer_norm(
-    graph_module: GraphModule,
-    inputs_inputs: List[fx.Node],
-    dequants_inputs: List[fx.Node],
-    other_inputs: List[fx.Node],
-    weights_init_inputs: List[fx.Node],
-    bias_inputs: List[fx.Node],
-    quant_node: fx.Node,
-) -> Tuple[Tuple[Any], Dict[str, Any]]:
-    """
-    Returns the args and kwargs for the layer norm replacement op.
-    """
-    # Check if the input is per-channel quantized
-    # TODO(matthiascremon): add proper support and testing for per-channel quantization
-    assert isinstance(dequants_inputs[0].args[1], float) and isinstance(
-        dequants_inputs[0].args[2], int
-    ), "per-channel quantization is not supported for layer norm, both scale and zero_point should be scalars"
-
-    # Make the scale and zero_point tensors
-    scale_tensor = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        (
-            [1],
-            dequants_inputs[0].args[1],
-        ),
-    )
-    zero_point_tensor = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        (
-            [1],
-            dequants_inputs[0].args[2],
-        ),
-    )
-
-    # Make the args and kwargs for the replacement op
-    args = tuple(inputs_inputs + [scale_tensor] + [zero_point_tensor])
-    kwargs = {
-        "normalized_shape": other_inputs[0],
-        "weight": weights_init_inputs[0],
-        "bias": bias_inputs[0],
-        "eps": 1e-05,
-        "output_scale": quant_node.args[1],
-        "output_zero_point": quant_node.args[2],
-    }
-    return args, kwargs
-
-
-def get_conv_args(arg, first_val: int) -> List[fx.Node]:
-    return arg if len(arg) == 2 else [first_val, arg[0]]
-
-
-def get_args_and_kwargs_conv1d(
-    graph_module: GraphModule,
-    inputs_inputs: List[fx.Node],
-    dequants_inputs: List[fx.Node],
-    other_inputs: List[fx.Node],
-    weights_inputs: List[fx.Node],
-    dequants_weights: List[fx.Node],
-    bias_inputs: List[fx.Node],
-    quant_node: fx.Node,
-    op_node: fx.Node,
-):
-    weight_scale = get_weight_node(weights_inputs[0], dequants_weights[0]).args[1]
-    weight_zero_point = get_weight_node(weights_inputs[0], dequants_weights[0]).args[2]
-    # pyre-fixme[58]: Unsupported operand types
-    bias_scale = dequants_inputs[0].args[1] * weight_scale
-    stride = [1, 1] if len(op_node.args) < 4 else get_conv_args(op_node.args[3], 1)
-    padding = [0, 0] if len(op_node.args) < 5 else get_conv_args(op_node.args[4], 0)
-    dilation = [1, 1] if len(op_node.args) < 6 else get_conv_args(op_node.args[5], 1)
-    groups = 1 if len(op_node.args) < 7 else op_node.args[6]
-    # If bias is not available, create a bias tensor with the shape of weight[0]
-    if not bias_inputs:
-        weight_node = get_weight_node(weights_inputs[0], dequants_weights[0]).args[0]
-        # pyre-fixme[16]: Undefined attribute
-        attr_node = getattr(graph_module, weight_node.target)
-        weight_shape = list(attr_node.shape)
-        bias_shape = weight_shape[0]
-        bias = graph_module.graph.call_function(
-            torch.ops.aten.full.default, ([bias_shape], 0.0)
-        )
-    else:
-        bias = bias_inputs[0]
-    # The bias is quantized to int32_t
-    bias_int32_quant = graph_module.graph.call_function(
-        torch.ops.quantized_decomposed.quantize_per_tensor.default,
-        (
-            bias,
-            bias_scale,
-            0,
-            -(2**31),
-            (2**31) - 1,
-            torch.int32,
-        ),
-    )
-
-    # Compute the out multiplier and out shift. They are used when the conv op is
-    # replaced by quantized linear, we compute them a priori for simplicity but
-    # may revisit the decision.
-    requantize_scale = bias_scale / quant_node.args[1]
-    requantize_scale_t = torch.tensor([requantize_scale])
-
-    (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)
-
-    out_multiplier_ = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], out_multiplier[0].item()),
-        {"dtype": torch.int32},
-    )
-    out_shift_ = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], out_shift[0].item()),
-        {"dtype": torch.int32},
-    )
-
-    # Create a single element tensor for the weight zero point
-    weight_zero_point_tensor = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], weight_zero_point),
-        {"dtype": torch.int32},
-    )
-
-    # Create a single element tensor for the bias scale
-    bias_scale_tensor = graph_module.graph.call_function(
-        torch.ops.aten.full.default,
-        ([1], bias_scale),
-        {"dtype": torch.float32},
-    )
-
-    # Make the args and kwargs for the replacement op
-    args = tuple(inputs_inputs + weights_inputs + other_inputs + [bias_int32_quant])
-    kwargs = {
-        "stride": stride,
-        "padding": padding,
-        "dilation": dilation,
-        "groups": groups,
-        "input_zero_point": dequants_inputs[0].args[2],
-        "weight_zero_point": weight_zero_point_tensor,
-        "bias_scale": bias_scale_tensor,
-        "out_scale": quant_node.args[1],
-        "out_zero_point": quant_node.args[2],
-        "out_multiplier": out_multiplier_,
-        "out_shift": out_shift_,
-        "channel_last": False,
-    }
-    return args, kwargs
-
-
-def get_args_and_kwargs_relu(
-    graph_module: GraphModule,
-    inputs_inputs: List[fx.Node],
-    dequants_inputs: List[fx.Node],
-):
-    # Make the args and kwargs for the replacement op
-    args = tuple(inputs_inputs)
-
-    X_zero_point = graph_module.graph.call_function(
-        torch.ops.aten.full.default, ([1], dequants_inputs[0].args[2])
-    )
-
-    kwargs = {
-        "X_zero_point": X_zero_point,
-    }
-    return args, kwargs
-
-
-@dataclass
-class PartitionAnchors:
-    """
-    All fields except output are lists of (node, args_index) pair, where node is from
-    the given partition and node.args[args_index] is an input to the partition. Assumes
-    a single output.
-
-    Quantizer uses inputs, weights and biases for quantization annotation. The others
-    field contains tensor inputs that aren't quantized, and the literals fields contains
-    is used for other types of input values as well as handling default parameters.
-    """
-
-    inputs: List[Tuple[fx.Node, int]] = field(default_factory=list)
-    weights: List[Tuple[fx.Node, int]] = field(default_factory=list)
-    biases: List[Tuple[fx.Node, int]] = field(default_factory=list)
-    others: List[Tuple[fx.Node, int]] = field(default_factory=list)
-    literals: List[Tuple[fx.Node, int]] = field(default_factory=list)
-    output: List[Union[Tuple[fx.Node], Tuple[fx.Node, QuantizationSpec]]] = field(
-        default_factory=list
-    )
-
-
-class QuantizationPattern(ABC):
-    @abstractmethod
-    def partition_types(self) -> List[Any]:
-        """
-        List of types to be passed to find_sequential_partitions.
-        """
-        pass
-
-    @abstractmethod
-    def get_anchors(self, gm, fused_partition) -> Optional[PartitionAnchors]:
-        pass
-
-    @abstractmethod
-    def replacement_op(self) -> Callable[..., Any]:
-        """
-        Operator (most likely a custom one) that this partition should be fused into in
-        the backend. Refer to the QuantFusion pass for examples.
-        """
-        pass
-
-
-class LinearPattern(QuantizationPattern):
-    def partition_types(self) -> List[Type[torch.nn.Module]]:
-        return [torch.nn.Linear]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        linear_node = fused_partition[0].nodes[-1]
-
-        # Keep bias empty if not supplied
-        bias = []
-        if len(linear_node.args) > 2:
-            bias = [(linear_node, 2)]
-
-        return PartitionAnchors(
-            inputs=[(linear_node, 0)],
-            weights=[(linear_node, 1)],
-            biases=bias,
-            output=[(linear_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_linear.default
-
-
-class LinearFunctionalPattern(QuantizationPattern):
-    def partition_types(self):
-        return [torch.nn.functional.linear]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        linear_node = fused_partition[0].nodes[-1]
-
-        return PartitionAnchors(
-            inputs=[(linear_node, 0)],
-            weights=[(linear_node, 1)],
-            biases=[(linear_node, 2)],
-            output=[(linear_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_linear.default
-
-
-class LayerNormPattern(QuantizationPattern):
-    def partition_types(self):
-        return [torch.nn.LayerNorm]
-
-    def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
-        layer_norm_node = fused_partition[0].nodes[-1]
-
-        return PartitionAnchors(
-            inputs=[(layer_norm_node, 0)],
-            weights=[(layer_norm_node, 2)],
-            biases=[(layer_norm_node, 3)],
-            others=[(layer_norm_node, 1)],
-            output=[(layer_norm_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_layer_norm.default
-
-
-class Conv1dPattern(QuantizationPattern):
-    def partition_types(self) -> List[Type[torch.nn.Module]]:
-        return [torch.nn.Conv1d]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        conv1d_node = fused_partition[0].nodes[-1]
-
-        # If bias is None, replace it with an empty list.
-        bias = (
-            [(conv1d_node, 2)]
-            if len(conv1d_node.args) > 2 and conv1d_node.args[2]
-            else []
-        )
-
-        return PartitionAnchors(
-            inputs=[(conv1d_node, 0)],
-            weights=[(conv1d_node, 1)],
-            biases=bias,
-            output=[(conv1d_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_conv.default
-
-
-class Conv2dPattern(QuantizationPattern):
-    def partition_types(self) -> List[Type[torch.nn.Module]]:
-        return [torch.nn.Conv2d]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        conv2d_node = fused_partition[0].nodes[-1]
-
-        # If bias is None, replace it with an empty list.
-        bias = (
-            [(conv2d_node, 2)]
-            if len(conv2d_node.args) > 2 and conv2d_node.args[2]
-            else []
-        )
-
-        return PartitionAnchors(
-            inputs=[(conv2d_node, 0)],
-            weights=[(conv2d_node, 1)],
-            biases=bias,
-            output=[(conv2d_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_conv.default
-
-
-class AddmmPattern(QuantizationPattern):
-    def partition_types(self) -> List[Type[torch.nn.Module]]:
-        return [torch.addmm]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        addmm_node = fused_partition[0].nodes[-1]
-
-        return PartitionAnchors(
-            inputs=[(addmm_node, 1)],
-            weights=[(addmm_node, 2)],
-            biases=[(addmm_node, 0)],
-            output=[(addmm_node,)],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_linear.default
-
-
-class ReluPattern(QuantizationPattern):
-    def partition_types(self) -> List[Type[torch.nn.Module]]:
-        return [torch.nn.ReLU]
-
-    def get_anchors(
-        self, gm: GraphModule, fused_partition: List[GraphModule]
-    ) -> PartitionAnchors:
-        relu_node = fused_partition[0].nodes[-1]
-
-        return PartitionAnchors(
-            inputs=[(relu_node, 0)],
-            weights=[],
-            biases=[],
-            # pyre-fixme[6]: Incompatible parameter type
-            output=[
-                (relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node)))
-            ],
-        )
-
-    def replacement_op(self):
-        return torch.ops.cadence.quantized_relu.default
-
-
-class GenericQuantizer(Quantizer):
-    def __init__(self, pattern, quantization_config):
-        super().__init__()
-        self.pattern = pattern
-        self.quantization_config = quantization_config
-
-    def annotate(self, model):
-        fused_partitions = find_sequential_partitions(
-            model,
-            self.pattern.partition_types(),
-        )
-
-        input_act_qspec = self.quantization_config.input_activation
-        weight_qspec = self.quantization_config.weight
-        bias_qspec = self.quantization_config.bias
-        output_act_qspec = self.quantization_config.output_activation
-
-        for fused_partition in fused_partitions:
-            if not _no_outside_users(fused_partition):
-                continue
-
-            anchors = self.pattern.get_anchors(model, fused_partition)
-            if not anchors:
-                continue
-            if _is_annotated(
-                [
-                    x[0]
-                    for x in anchors.inputs
-                    + anchors.weights
-                    + anchors.biases
-                    + anchors.output
-                ]
-            ):
-                continue
-
-            for output, *custom_spec in anchors.output:
-                output.meta["quantization_annotation"] = QuantizationAnnotation(
-                    output_qspec=custom_spec[0] if custom_spec else output_act_qspec,
-                    _annotated=True,
-                )
-
-            def annotate_inputs(inputs, spec):
-                for node, idx in inputs:
-                    annotation = node.meta.get(
-                        "quantization_annotation",
-                        QuantizationAnnotation(_annotated=True),
-                    )
-                    annotation.input_qspec_map[node.args[idx]] = spec
-                    node.meta["quantization_annotation"] = annotation
-
-            annotate_inputs(anchors.inputs, input_act_qspec)
-            annotate_inputs(anchors.weights, weight_qspec)
-            annotate_inputs(anchors.biases, bias_qspec)
-
-    def validate(self, model: fx.GraphModule) -> None:
-        pass
-
-    @classmethod
-    def get_supported_operators(cls) -> List[OperatorConfig]:
-        return []
-
-
-act_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
-)
-
-wgt_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=MinMaxObserver,
-)
-
-
-class CadenceBaseQuantizer(ComposableQuantizer):
-    def __init__(self):
-        static_qconfig = QuantizationConfig(
-            act_qspec,
-            act_qspec,
-            wgt_qspec,
-            None,
-        )
-        static_qconfig_no_wgt = QuantizationConfig(
-            act_qspec,
-            act_qspec,
-            None,
-            None,
-        )
-        super().__init__(
-            [
-                GenericQuantizer(AddmmPattern(), static_qconfig),
-                GenericQuantizer(Conv1dPattern(), static_qconfig),
-                GenericQuantizer(Conv2dPattern(), static_qconfig),
-                GenericQuantizer(LayerNormPattern(), static_qconfig_no_wgt),
-                GenericQuantizer(LinearFunctionalPattern(), static_qconfig),
-                GenericQuantizer(LinearPattern(), static_qconfig),
-                GenericQuantizer(ReluPattern(), static_qconfig),
-            ]
-        )
-
-
-class QuantFusion(ExportPass):
-    def __init__(self, patterns):
-        super().__init__()
-        self.patterns = patterns
-
-    def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
-        for pattern in self.patterns:
-            fused_partitions = find_sequential_partitions(
-                graph_module,
-                pattern.partition_types(),
-            )
-            for fused_partition in fused_partitions:
-                anchors = pattern.get_anchors(graph_module, fused_partition)
-                if not anchors:
-                    continue
-                if any(self.is_fused(p.nodes) for p in fused_partition):
-                    continue
-
-                for p in fused_partition:
-                    self.mark_fused(p.nodes)
-
-                dequants_inputs = []
-                for node, idx in anchors.inputs:
-                    if (
-                        node.args[idx].target
-                        == torch.ops.quantized_decomposed.dequantize_per_tensor.default
-                    ):
-                        dequants_inputs.append(node.args[idx])
-                dequants_weights = []
-                for node, idx in anchors.weights:
-                    if (
-                        node.args[idx].target
-                        == torch.ops.quantized_decomposed.dequantize_per_tensor.default
-                    ):
-                        dequants_weights.append(node.args[idx])
-
-                inputs_inputs = [node.args[0] for node in dequants_inputs]
-                weights_inputs = [node.args[0] for node in dequants_weights]
-                weights_init_inputs = [node.args[idx] for node, idx in anchors.weights]
-                bias_inputs = [node.args[idx] for node, idx in anchors.biases]
-                other_inputs = [node.args[idx] for node, idx in anchors.others]
-
-                # The node is the first index of the list and first of the tuple
-                op_node = anchors.output[0][0]
-
-                assert len(op_node.users) == 1
-                quant_node = list(op_node.users.keys())[0]
-
-                with graph_module.graph.inserting_after(op_node):
-                    args = tuple(
-                        inputs_inputs + weights_inputs + other_inputs + bias_inputs
-                    )
-                    kwargs = {}
-                    if isinstance(pattern, Conv1dPattern) or isinstance(
-                        pattern, Conv2dPattern
-                    ):
-                        args, kwargs = get_args_and_kwargs_conv1d(
-                            graph_module,
-                            inputs_inputs,
-                            dequants_inputs,
-                            other_inputs,
-                            weights_inputs,
-                            dequants_weights,
-                            bias_inputs,
-                            quant_node,
-                            op_node,
-                        )
-                    elif isinstance(pattern, LinearPattern) or isinstance(
-                        pattern, LinearFunctionalPattern
-                    ):
-                        args, kwargs = get_args_and_kwargs_linear(
-                            graph_module,
-                            inputs_inputs,
-                            dequants_inputs,
-                            other_inputs,
-                            weights_inputs,
-                            dequants_weights,
-                            bias_inputs,
-                            quant_node,
-                        )
-                    elif isinstance(pattern, LayerNormPattern):
-                        args, kwargs = get_args_and_kwargs_layer_norm(
-                            graph_module,
-                            inputs_inputs,
-                            dequants_inputs,
-                            other_inputs,
-                            weights_init_inputs,
-                            bias_inputs,
-                            quant_node,
-                        )
-                    elif isinstance(pattern, AddmmPattern):
-                        # Transpose the weight tensor
-                        transposed_weights = graph_module.graph.call_function(
-                            torch.ops.aten.transpose.int,
-                            (weights_inputs[0], 0, 1),
-                        )
-                        # Call linear with transposed weight
-                        args, kwargs = get_args_and_kwargs_linear(
-                            graph_module,
-                            inputs_inputs,
-                            dequants_inputs,
-                            other_inputs,
-                            [transposed_weights],
-                            dequants_weights,
-                            bias_inputs,
-                            quant_node,
-                        )
-                    elif isinstance(pattern, ReluPattern):
-                        args, kwargs = get_args_and_kwargs_relu(
-                            graph_module,
-                            inputs_inputs,
-                            dequants_inputs,
-                        )
-                    fused = graph_module.graph.call_function(
-                        pattern.replacement_op(),
-                        args,
-                        kwargs,
-                    )
-                    fused.meta = quant_node.meta
-                    quant_node.replace_all_uses_with(fused)
-
-            legalize_graph(graph_module)
-            graph_module.graph.eliminate_dead_code()
-            # pyre-fixme[7]: Incompatible return type
-            graph_module.recompile()
-
-    @classmethod
-    def is_fused(cls, nodes) -> bool:
-        return any(cls.__qualname__ in n.meta for n in nodes)
-
-    @classmethod
-    def mark_fused(cls, nodes) -> bool:
-        for n in nodes:
-            # pyre-fixme[7]: Incompatible return type
-            n.meta["QuantFusion"] = True
-
-
-class ReplacePT2QuantWithCadenceQuant(ExportPass):
-    """
-    Replace the pt2 quantization ops with custom cadence quantization ops.
-    """
-
-    def call_operator(self, op, args, kwargs, meta):
-        if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}:
-            return super().call_operator(op, args, kwargs, meta)
-
-        return super().call_operator(
-            exir_ops.edge.cadence.quantize_per_tensor.default,
-            args,
-            kwargs,
-            meta,
-        )
-
-
-class ReplacePT2DequantWithCadenceDequant(ExportPass):
-    """
-    Replace the pt2 dequantization ops with custom cadence dequantization ops.
-    """
-
-    def call_operator(self, op, args, kwargs, meta):
-        if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}:
-            return super().call_operator(op, args, kwargs, meta)
-
-        return super().call_operator(
-            exir_ops.edge.cadence.dequantize_per_tensor.default,
-            args,
-            kwargs,
-            meta,
-        )
diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py
new file mode 100644
index 00000000000..69b12cf7a97
--- /dev/null
+++ b/backends/cadence/aot/quantizer/fusion_pass.py
@@ -0,0 +1,437 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, List, Tuple
+
+import torch
+from executorch.backends.cadence.aot.quantizer.patterns import (
+    AddmmPattern,
+    Conv1dPattern,
+    Conv2dPattern,
+    LayerNormFunctionalPattern,
+    LayerNormPattern,
+    LinearFunctionalPattern,
+    LinearPattern,
+    MatmulPattern,
+    ReluPattern,
+)
+from executorch.backends.cadence.aot.quantizer.utils import (
+    create_zero_bias_int32,
+    get_conv_args,
+    quantize_tensor_multiplier,
+)
+from executorch.exir.pass_base import ExportPass
+from torch import fx
+from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
+from torch.fx import GraphModule
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.utils.fuser_utils import legalize_graph
+
+
+# Helper function to get the args and kwargs for the linear replacement op
+def get_args_and_kwargs_linear(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    weights_inputs: List[fx.Node],
+    dequants_weights: List[fx.Node],
+    bias_inputs: List[fx.Node],
+    quant_node: fx.Node,
+) -> Tuple[Tuple[Any], Dict[str, Any]]:
+    """
+    Returns the args and kwargs for the linear replacement op.
+    """
+    weight_scale = dequants_weights[0].args[1]
+    # pyre-fixme[58]: Unsupported operand types
+    bias_scale = dequants_inputs[0].args[1] * weight_scale
+    requantize_scale = bias_scale / quant_node.args[1]
+    requantize_scale_t = torch.tensor([requantize_scale])
+
+    (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)
+
+    # If bias is not available, create a bias tensor with the shape of weight[0]
+    if not bias_inputs:
+        weight_node = dequants_weights[0].args[0]
+        assert isinstance(weight_node, fx.Node)
+        bias = create_zero_bias_int32(graph_module, weight_node, bias_scale)
+    else:
+        bias = bias_inputs[0]
+
+    # Create single element tensors for weight_zero_point, out_multiplier, out_shift.
+    # Note that the function expects int32_t, when it would default to int64_t, so
+    # we explicitly require that type.
+    weight_zero_point_ = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], dequants_weights[0].args[2]),
+        {"dtype": torch.int32},
+    )
+    out_multiplier_ = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], out_multiplier[0].item()),
+        {"dtype": torch.int32},
+    )
+    out_shift_ = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], out_shift[0].item()),
+        {"dtype": torch.int32},
+    )
+
+    args = tuple(inputs_inputs + weights_inputs + [bias])
+    kwargs = {
+        "src_zero_point": dequants_inputs[0].args[2],
+        "weight_zero_point": weight_zero_point_,
+        "out_multiplier": out_multiplier_,
+        "out_shift": out_shift_,
+        "out_zero_point": quant_node.args[2],
+        "offset": None,
+    }
+    return args, kwargs
+
+
+# Helper function to get the args and kwargs for the layer norm replacement op
+def get_args_and_kwargs_layer_norm(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    other_inputs: List[fx.Node],
+    quant_node: fx.Node,
+) -> Tuple[Tuple[Any], Dict[str, Any]]:
+    """
+    Returns the args and kwargs for the layer norm replacement op.
+    """
+    # Check if the input is per-channel quantized
+    # TODO(matthiascremon): add proper support and testing for per-channel quantization
+    assert isinstance(dequants_inputs[0].args[1], float) and isinstance(
+        dequants_inputs[0].args[2], int
+    ), "per-channel quantization is not supported for layer norm, both scale and zero_point should be scalars"
+
+    # Make the scale and zero_point tensors
+    scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+
+    weight = other_inputs[1] if len(other_inputs) > 1 else None
+
+    if not weight:
+        weight = graph_module.graph.call_function(
+            torch.ops.aten.full.default,
+            (
+                other_inputs[0],
+                1,
+            ),
+            {"dtype": torch.float32},
+        )
+
+    bias = other_inputs[2] if len(other_inputs) > 2 else None
+
+    if not bias:
+        bias = graph_module.graph.call_function(
+            torch.ops.aten.full.default,
+            (
+                other_inputs[0],
+                0,
+            ),
+            {"dtype": torch.float32},
+        )
+
+    # Make the args and kwargs for the replacement op
+    args = tuple(inputs_inputs + [scale_tensor] + [zero_point_tensor])
+    kwargs = {
+        "normalized_shape": other_inputs[0],
+        "weight": weight,
+        "bias": bias,
+        "eps": 1e-05,
+        "output_scale": quant_node.args[1],
+        "output_zero_point": quant_node.args[2],
+    }
+    return args, kwargs
+
+
+def get_args_and_kwargs_matmul(
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    quant_node: fx.Node,
+) -> Tuple[Tuple[Any], Dict[str, Any]]:
+    requantize_scale = (
+        # pyre-ignore[58]: Unsupported operand
+        dequants_inputs[0].args[1]
+        * dequants_inputs[1].args[1]
+    ) / quant_node.args[1]
+    requantize_scale_t = torch.tensor([requantize_scale])
+
+    (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)
+
+    args = (
+        inputs_inputs[0],
+        dequants_inputs[0].args[2],
+        inputs_inputs[1],
+        dequants_inputs[1].args[2],
+        None,
+    )
+
+    kwargs = {
+        "out_multiplier": out_multiplier[0].item(),
+        "out_shift": out_shift[0].item(),
+        "out_zero_point": quant_node.args[2],
+        "transposed": False,
+    }
+    return args, kwargs
+
+
+def get_args_and_kwargs_conv(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    weights_inputs: List[fx.Node],
+    dequants_weights: List[fx.Node],
+    bias_inputs: List[fx.Node],
+    quant_node: fx.Node,
+    op_node: fx.Node,
+):
+    weight_scale = dequants_weights[0].args[1]
+    weight_zero_point = dequants_weights[0].args[2]
+    # pyre-fixme[58]: Unsupported operand types
+    bias_scale = dequants_inputs[0].args[1] * weight_scale
+    stride = [1, 1] if len(op_node.args) < 4 else get_conv_args(op_node.args[3], 1)
+    padding = [0, 0] if len(op_node.args) < 5 else get_conv_args(op_node.args[4], 0)
+    dilation = [1, 1] if len(op_node.args) < 6 else get_conv_args(op_node.args[5], 1)
+    groups = 1 if len(op_node.args) < 7 else op_node.args[6]
+
+    # If bias is not available, create a bias tensor with the shape of weight[0]
+    if not bias_inputs:
+        weight_node = dequants_weights[0].args[0]
+        assert isinstance(weight_node, fx.Node)
+        bias = create_zero_bias_int32(graph_module, weight_node, bias_scale)
+    else:
+        bias = bias_inputs[0]
+
+    # Compute the out multiplier and out shift. They are used when the conv op is
+    # replaced by quantized linear, we compute them a priori for simplicity but
+    # may revisit the decision.
+    requantize_scale = bias_scale / quant_node.args[1]
+    requantize_scale_t = torch.tensor([requantize_scale])
+
+    (out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)
+
+    out_multiplier_ = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], out_multiplier[0].item()),
+        {"dtype": torch.int32},
+    )
+    out_shift_ = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], out_shift[0].item()),
+        {"dtype": torch.int32},
+    )
+
+    # Create a single element tensor for the weight zero point
+    weight_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], weight_zero_point),
+        {"dtype": torch.int32},
+    )
+
+    # Create a single element tensor for the bias scale
+    bias_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([1], bias_scale),
+        {"dtype": torch.float32},
+    )
+
+    # Make the args and kwargs for the replacement op
+    args = tuple(inputs_inputs + weights_inputs + [bias])
+    kwargs = {
+        "stride": stride,
+        "padding": padding,
+        "dilation": dilation,
+        "groups": groups,
+        "input_zero_point": dequants_inputs[0].args[2],
+        "weight_zero_point": weight_zero_point_tensor,
+        "bias_scale": bias_scale_tensor,
+        "out_scale": quant_node.args[1],
+        "out_zero_point": quant_node.args[2],
+        "out_multiplier": out_multiplier_,
+        "out_shift": out_shift_,
+        "channel_last": False,
+    }
+    return args, kwargs
+
+
+def get_args_and_kwargs_relu(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+):
+    # Make the args and kwargs for the replacement op
+    args = tuple(inputs_inputs)
+
+    X_zero_point = graph_module.graph.call_function(
+        torch.ops.aten.full.default, ([1], dequants_inputs[0].args[2])
+    )
+
+    kwargs = {
+        "X_zero_point": X_zero_point,
+    }
+    return args, kwargs
+
+
+class QuantFusion(ExportPass):
+    def __init__(self, patterns):
+        super().__init__()
+        self.patterns = patterns
+
+    def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
+        for pattern in self.patterns:
+            fused_partitions = find_sequential_partitions(
+                graph_module,
+                pattern.partition_types(),
+            )
+            for fused_partition in fused_partitions:
+                anchors = pattern.get_anchors(graph_module, fused_partition)
+                if not anchors:
+                    continue
+                if any(self.is_fused(p.nodes) for p in fused_partition):
+                    continue
+
+                for p in fused_partition:
+                    self.mark_fused(p.nodes)
+
+                dequants_inputs = []
+                for node, idx in anchors.inputs:
+                    if (
+                        node.args[idx].target
+                        == torch.ops.quantized_decomposed.dequantize_per_tensor.default
+                    ):
+                        dequants_inputs.append(node.args[idx])
+                dequants_weights = []
+                for node, idx in anchors.weights:
+                    if (
+                        node.args[idx].target
+                        == torch.ops.quantized_decomposed.dequantize_per_tensor.default
+                    ):
+                        dequants_weights.append(node.args[idx])
+                dequants_biases = []
+                for node, idx, *_spec in anchors.biases:
+                    if (
+                        node.args[idx].target
+                        == torch.ops.quantized_decomposed.dequantize_per_tensor.default
+                    ):
+                        dequants_biases.append(node.args[idx])
+
+                inputs_inputs = [node.args[0] for node in dequants_inputs]
+                weights_inputs = [node.args[0] for node in dequants_weights]
+                bias_inputs = [node.args[0] for node in dequants_biases]
+                other_inputs = [node.args[idx] for node, idx in anchors.others]
+
+                # The node is the first index of the list and first of the tuple
+                op_node = anchors.output[0][0]
+
+                assert len(op_node.users) == 1
+                quant_node = list(op_node.users.keys())[0]
+
+                with graph_module.graph.inserting_after(op_node):
+                    args = tuple(
+                        inputs_inputs + weights_inputs + other_inputs + bias_inputs
+                    )
+                    kwargs = {}
+                    if isinstance(pattern, Conv1dPattern) or isinstance(
+                        pattern, Conv2dPattern
+                    ):
+                        args, kwargs = get_args_and_kwargs_conv(
+                            graph_module,
+                            inputs_inputs,
+                            dequants_inputs,
+                            weights_inputs,
+                            dequants_weights,
+                            bias_inputs,
+                            quant_node,
+                            op_node,
+                        )
+                    elif isinstance(pattern, LinearPattern) or isinstance(
+                        pattern, LinearFunctionalPattern
+                    ):
+                        args, kwargs = get_args_and_kwargs_linear(
+                            graph_module,
+                            inputs_inputs,
+                            dequants_inputs,
+                            weights_inputs,
+                            dequants_weights,
+                            bias_inputs,
+                            quant_node,
+                        )
+                    elif isinstance(pattern, LayerNormPattern) or isinstance(
+                        pattern, LayerNormFunctionalPattern
+                    ):
+                        args, kwargs = get_args_and_kwargs_layer_norm(
+                            graph_module,
+                            inputs_inputs,
+                            dequants_inputs,
+                            other_inputs,
+                            quant_node,
+                        )
+                    elif isinstance(pattern, MatmulPattern):
+                        args, kwargs = get_args_and_kwargs_matmul(
+                            inputs_inputs,
+                            dequants_inputs,
+                            quant_node,
+                        )
+                    elif isinstance(pattern, AddmmPattern):
+                        # Transpose the weight tensor
+                        transposed_weights = graph_module.graph.call_function(
+                            torch.ops.aten.transpose.int,
+                            (weights_inputs[0], 0, 1),
+                        )
+                        # Call linear with transposed weight
+                        args, kwargs = get_args_and_kwargs_linear(
+                            graph_module,
+                            inputs_inputs,
+                            dequants_inputs,
+                            [transposed_weights],
+                            dequants_weights,
+                            bias_inputs,
+                            quant_node,
+                        )
+                    elif isinstance(pattern, ReluPattern):
+                        args, kwargs = get_args_and_kwargs_relu(
+                            graph_module,
+                            inputs_inputs,
+                            dequants_inputs,
+                        )
+                    fused = graph_module.graph.call_function(
+                        pattern.replacement_op(),
+                        args,
+                        kwargs,
+                    )
+                    fused.meta = quant_node.meta
+                    quant_node.replace_all_uses_with(fused)
+
+            legalize_graph(graph_module)
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+        return PassResult(graph_module, True)
+
+    @classmethod
+    def is_fused(cls, nodes) -> bool:
+        return any(cls.__qualname__ in n.meta for n in nodes)
+
+    @classmethod
+    def mark_fused(cls, nodes) -> None:
+        for n in nodes:
+            # Tag with the same key that is_fused() looks up.
+            n.meta[cls.__qualname__] = True
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py
new file mode 100644
index 00000000000..6df27982585
--- /dev/null
+++ b/backends/cadence/aot/quantizer/patterns.py
@@ -0,0 +1,344 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Callable, List, Optional, Tuple, Type, Union
+
+import torch
+from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams
+
+from torch import fx
+from torch.ao.quantization.quantizer import (
+    DerivedQuantizationSpec,
+    SharedQuantizationSpec,
+)
+
+
+@dataclass
+class PartitionAnchors:
+    """
+    All fields except output are lists of (node, args_index) pair, where node is from
+    the given partition and node.args[args_index] is an input to the partition. Assumes
+    a single output.
+
+    Quantizer uses inputs, weights and biases for quantization annotation. The others
+    field contains tensor inputs that aren't quantized, and the literals field is
+    used for other types of input values as well as handling default parameters.
+    """
+
+    inputs: List[Tuple[fx.Node, int]] = field(default_factory=list)
+    weights: List[Tuple[fx.Node, int]] = field(default_factory=list)
+    biases: List[
+        Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]]
+    ] = field(default_factory=list)
+    others: List[Tuple[fx.Node, int]] = field(default_factory=list)
+    literals: List[Tuple[fx.Node, int]] = field(default_factory=list)
+    output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field(
+        default_factory=list
+    )
+
+
+class QuantizationPattern(ABC):
+    @abstractmethod
+    def partition_types(self):
+        """
+        List of types to be passed to find_sequential_partitions.
+        """
+        pass
+
+    @abstractmethod
+    def get_anchors(self, gm, fused_partition) -> Optional[PartitionAnchors]:
+        pass
+
+    @abstractmethod
+    def replacement_op(self) -> Callable[..., Any]:
+        """
+        Operator (most likely a custom one) that this partition should be fused into in
+        the backend. Refer to the QuantFusion pass for examples.
+        """
+        pass
+
+
+class AddmmPattern(QuantizationPattern):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
+        return [torch.addmm]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        addmm_node = fused_partition[0].nodes[-1]
+
+        bias_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (addmm_node.args[1], addmm_node),
+                (addmm_node.args[2], addmm_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31),
+            quant_max=2**31 - 1,
+            qscheme=torch.per_tensor_affine,
+        )
+
+        return PartitionAnchors(
+            inputs=[(addmm_node, 1)],
+            weights=[(addmm_node, 2)],
+            biases=[(addmm_node, 0, bias_qspec)],
+            output=[(addmm_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_linear.default
+
+
+class Conv1dPattern(QuantizationPattern):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
+        return [torch.nn.Conv1d]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        conv1d_node = fused_partition[0].nodes[-1]
+
+        bias_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (conv1d_node.args[0], conv1d_node),
+                (conv1d_node.args[1], conv1d_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31),
+            quant_max=2**31 - 1,
+            qscheme=torch.per_tensor_affine,
+        )
+
+        # Keep bias empty if not supplied
+        bias = []
+        if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None:
+            bias = [(conv1d_node, 2, bias_qspec)]
+
+        return PartitionAnchors(
+            inputs=[(conv1d_node, 0)],
+            weights=[(conv1d_node, 1)],
+            # pyre-fixme[6]: Incompatible parameter type
+            biases=bias,
+            output=[(conv1d_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_conv.default
+
+
+class Conv2dPattern(QuantizationPattern):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
+        return [torch.nn.Conv2d]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        conv2d_node = fused_partition[0].nodes[-1]
+
+        bias_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (conv2d_node.args[0], conv2d_node),
+                (conv2d_node.args[1], conv2d_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31),
+            quant_max=2**31 - 1,
+            qscheme=torch.per_tensor_affine,
+        )
+
+        # Keep bias empty if not supplied
+        bias = []
+        if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None:
+            bias = [(conv2d_node, 2, bias_qspec)]
+
+        return PartitionAnchors(
+            inputs=[(conv2d_node, 0)],
+            weights=[(conv2d_node, 1)],
+            # pyre-fixme[6]: Incompatible parameter type
+            biases=bias,
+            output=[(conv2d_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_conv.default
+
+
+class LayerNormPattern(QuantizationPattern):
+    def partition_types(self):
+        return [torch.nn.LayerNorm]
+
+    def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
+        layer_norm_node = fused_partition[0].nodes[-1]
+
+        # Weights and biases are used as fp32 by our kernel, so they are
+        # passed in as others here along with the normalized shape.
+        return PartitionAnchors(
+            inputs=[(layer_norm_node, 0)],
+            weights=[],
+            biases=[],
+            # Ordering: normalized_shape, weights, bias
+            others=[(layer_norm_node, 1), (layer_norm_node, 2), (layer_norm_node, 3)],
+            output=[(layer_norm_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_layer_norm.default
+
+
+class LayerNormFunctionalPattern(QuantizationPattern):
+    def partition_types(self):
+        return [torch.nn.functional.layer_norm]
+
+    def get_anchors(self, gm, fused_partition) -> PartitionAnchors:
+        layer_norm_node = fused_partition[0].nodes[-1]
+
+        others = [(layer_norm_node, 1)]
+
+        # Add weights if supplied
+        if len(layer_norm_node.args) > 2 and layer_norm_node.args[2]:
+            others.append((layer_norm_node, 2))
+
+        # Add bias if supplied
+        if len(layer_norm_node.args) > 3 and layer_norm_node.args[3]:
+            others.append((layer_norm_node, 3))
+
+        # Weights and bias are not quantized here, so they are
+        # passed in as others here along with the normalized shape.
+        return PartitionAnchors(
+            inputs=[(layer_norm_node, 0)],
+            weights=[],
+            biases=[],
+            # Ordering: normalized_shape, weights, bias
+            # pyre-fixme[6]: Incompatible parameter type
+            others=others,
+            output=[(layer_norm_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_layer_norm.default
+
+
+class LinearPattern(QuantizationPattern):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
+        return [torch.nn.Linear]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        linear_node = fused_partition[0].nodes[-1]
+
+        bias_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (linear_node.args[0], linear_node),
+                (linear_node.args[1], linear_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31),
+            quant_max=2**31 - 1,
+            qscheme=torch.per_tensor_affine,
+        )
+
+        # Keep bias empty if not supplied
+        bias = []
+        if len(linear_node.args) > 2 and linear_node.args[2] is not None:
+            bias = [(linear_node, 2, bias_qspec)]
+
+        return PartitionAnchors(
+            inputs=[(linear_node, 0)],
+            weights=[(linear_node, 1)],
+            # pyre-fixme[6]: Incompatible parameter type
+            biases=bias,
+            output=[(linear_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_linear.default
+
+
+class LinearFunctionalPattern(QuantizationPattern):
+    def partition_types(self):
+        return [torch.nn.functional.linear]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        linear_node = fused_partition[0].nodes[-1]
+
+        bias_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (linear_node.args[0], linear_node),
+                (linear_node.args[1], linear_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31),
+            quant_max=2**31 - 1,
+            qscheme=torch.per_tensor_affine,
+        )
+
+        # Keep bias empty if not supplied
+        bias = []
+        if len(linear_node.args) > 2 and linear_node.args[2] is not None:
+            bias = [(linear_node, 2, bias_qspec)]
+
+        return PartitionAnchors(
+            inputs=[(linear_node, 0)],
+            weights=[(linear_node, 1)],
+            # pyre-fixme[6]: Incompatible parameter type
+            biases=bias,
+            output=[(linear_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_linear.default
+
+
+class MatmulPattern(QuantizationPattern):
+    def partition_types(self):
+        return [torch.matmul]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        matmul_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(matmul_node, 0), (matmul_node, 1)],
+            weights=[],
+            biases=[],
+            output=[(matmul_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_matmul.default
+
+
+class ReluPattern(QuantizationPattern):
+    def partition_types(self) -> List[Type[torch.nn.Module]]:
+        return [torch.nn.ReLU]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        relu_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(relu_node, 0)],
+            weights=[],
+            biases=[],
+            output=[
+                (relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node)))
+            ],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_relu.default
diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
new file mode 100644
index 00000000000..79e6fb28149
--- /dev/null
+++ b/backends/cadence/aot/quantizer/quantizer.py
@@ -0,0 +1,145 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+
+import torch
+from executorch.backends.cadence.aot.quantizer.patterns import (
+    AddmmPattern,
+    Conv1dPattern,
+    Conv2dPattern,
+    LayerNormFunctionalPattern,
+    LayerNormPattern,
+    LinearFunctionalPattern,
+    LinearPattern,
+    MatmulPattern,
+    ReluPattern,
+)
+from executorch.backends.cadence.aot.quantizer.utils import (
+    is_annotated,
+    no_outside_users,
+)
+
+from torch import fx
+
+from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
+from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
+from torch.ao.quantization.quantizer import Quantizer
+from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    OperatorConfig,
+    QuantizationAnnotation,
+    QuantizationConfig,
+    QuantizationSpec,
+)
+
+
+act_qspec = QuantizationSpec(
+    dtype=torch.uint8,
+    quant_min=0,
+    quant_max=255,
+    qscheme=torch.per_tensor_affine,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
+)
+
+wgt_qspec = QuantizationSpec(
+    dtype=torch.uint8,
+    quant_min=0,
+    quant_max=255,
+    qscheme=torch.per_tensor_affine,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=MinMaxObserver,
+)
+
+bias_qspec = None
+
+
+class CadenceGenericQuantizer(Quantizer):
+    def __init__(self, pattern, quantization_config):
+        super().__init__()
+        self.pattern = pattern
+        self.quantization_config = quantization_config
+
+    def annotate(self, model):
+        fused_partitions = find_sequential_partitions(
+            model,
+            self.pattern.partition_types(),
+        )
+
+        input_act_qspec = self.quantization_config.input_activation
+        weight_qspec = self.quantization_config.weight
+        bias_qspec = self.quantization_config.bias
+        output_act_qspec = self.quantization_config.output_activation
+
+        for fused_partition in fused_partitions:
+            if not no_outside_users(fused_partition):
+                continue
+
+            anchors = self.pattern.get_anchors(model, fused_partition)
+            if not anchors:
+                continue
+            if is_annotated(
+                [
+                    x[0]
+                    for x in anchors.inputs
+                    + anchors.weights
+                    + anchors.biases
+                    + anchors.output
+                ]
+            ):
+                continue
+
+            for output, *custom_spec in anchors.output:
+                output.meta["quantization_annotation"] = QuantizationAnnotation(
+                    output_qspec=custom_spec[0] if custom_spec else output_act_qspec,
+                    _annotated=True,
+                )
+
+            def annotate_inputs(inputs, spec):
+                for node, idx, *custom_spec in inputs:
+                    annotation = node.meta.get(
+                        "quantization_annotation",
+                        QuantizationAnnotation(_annotated=True),
+                    )
+                    annotation.input_qspec_map[node.args[idx]] = (
+                        custom_spec[0] if custom_spec else spec
+                    )
+                    node.meta["quantization_annotation"] = annotation
+
+            annotate_inputs(anchors.inputs, input_act_qspec)
+            annotate_inputs(anchors.weights, weight_qspec)
+            annotate_inputs(anchors.biases, bias_qspec)
+
+    def validate(self, model: fx.GraphModule) -> None:
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> List[OperatorConfig]:
+        return []
+
+
+class CadenceQuantizer(ComposableQuantizer):
+    def __init__(self):
+        static_qconfig = QuantizationConfig(
+            act_qspec,
+            act_qspec,
+            wgt_qspec,
+            None,
+        )
+        super().__init__(
+            [
+                CadenceGenericQuantizer(AddmmPattern(), static_qconfig),
+                CadenceGenericQuantizer(Conv1dPattern(), static_qconfig),
+                CadenceGenericQuantizer(Conv2dPattern(), static_qconfig),
+                CadenceGenericQuantizer(LayerNormPattern(), static_qconfig),
+                CadenceGenericQuantizer(LayerNormFunctionalPattern(), static_qconfig),
+                CadenceGenericQuantizer(LinearPattern(), static_qconfig),
+                CadenceGenericQuantizer(LinearFunctionalPattern(), static_qconfig),
+                CadenceGenericQuantizer(MatmulPattern(), static_qconfig),
+                CadenceGenericQuantizer(ReluPattern(), static_qconfig),
+            ]
+        )
diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py
new file mode 100644
index 00000000000..21dac6b0b0f
--- /dev/null
+++ b/backends/cadence/aot/quantizer/utils.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from math import frexp, isclose, trunc
+from typing import List, Tuple
+
+import torch
+from torch import fx
+from torch.ao.quantization import ObserverOrFakeQuantize
+
+from torch.fx import GraphModule
+
+
+def quantize_tensor_multiplier(
+    requantize_scale_tensor: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Given requantize_scale_tensor with values in the interval (0, 1),
+    produce a pair of tensors (out_multiplier, right_shift) where out_multiplier
+    is an int32 tensor representing fixed-point values in the interval [-1, 1),
+    and right_shift is an amount to shift right by, so that the floating-point
+    multiplication of some int32 input with each value of requantize_scale_tensor:
+        result = int32_value * requantize_scale_tensor[i]
+    is best approximated by the integer-arithmetic-only code:
+        result = RoundingRightShift(FixedPointMultiplication(int32_value,
+                                    out_multiplier[i]), right_shift[i])
+    """
+
+    # This is identical to C++11 std::round(). Python's built-in round() rounds
+    # halfway cases to the nearest even integer; C++ rounds them away from zero.
+    def round_away_zero(f) -> int:
+        r = -0.5 if (f < 0) else 0.5
+        return trunc(f + r)
+
+    def quantize_scalar_multiplier(requantize_scale: float) -> Tuple[int, int]:
+        significand, exponent = frexp(requantize_scale)
+        significand_q31 = int(round_away_zero(significand * (1 << 31)))
+        # Handle the special case when the real multiplier was so close to 1
+        # that its fixed-point approximation was indistinguishable from 1.
+        # We handle this by dividing it by two and incrementing the exponent
+        # (which is returned as the shift amount) by 1.
+        if significand_q31 == (1 << 31):
+            significand_q31 //= 2
+            exponent += 1
+
+        # Verify that the decomposition of requantize_scale into significand
+        # and exponent is correct.
+        reconstructed = significand_q31 / (1 << 31) * pow(2, exponent)
+        assert isclose(
+            requantize_scale, reconstructed, rel_tol=1e-4, abs_tol=1e-4
+        ), "computation of significand and exponent from requantize_scale is not accurate"
+
+        return (significand_q31, exponent)
+
+    # Flatten the input scale tensor so that we can operate on individual values
+    orig_shape = requantize_scale_tensor.shape
+    flattened_tensor = requantize_scale_tensor.flatten().to(torch.float32)
+    out_multiplier = torch.zeros(flattened_tensor.shape, dtype=torch.int32)
+    right_shift = torch.zeros(flattened_tensor.shape, dtype=torch.int32)
+
+    # Iterate over the flattened scale tensor and compute the decomposition of
+    # each value in scale tensor into significand(out_multiplier) and
+    # exponent(right_shift)
+    for idx, scale in enumerate(flattened_tensor):
+        (si, ex) = quantize_scalar_multiplier(scale)
+        out_multiplier[idx], right_shift[idx] = si, ex
+
+    # Reshape the tensors back to the original shape
+    out_multiplier = out_multiplier.reshape(orig_shape)
+    right_shift = right_shift.reshape(orig_shape)
+
+    return (out_multiplier, right_shift)
+
+
+def is_annotated(nodes: List[fx.Node]) -> bool:
+    annotated = False
+    for node in nodes:
+        annotated = annotated or (
+            "quantization_annotation" in node.meta
+            and node.meta["quantization_annotation"]._annotated
+        )
+    return annotated
+
+
+def no_outside_users(fused_partition) -> bool:
+    """
+    Checks if each partition other than the last does not have any outside users.
+    """
+    for source_partition in fused_partition[:-1]:
+        if len(source_partition.output_nodes) != 1:
+            return False
+        if len(source_partition.output_nodes[0].users) != 1:
+            return False
+    return True
+
+
+def create_zero_bias_int32(
+    graph_module: GraphModule,
+    weight_node: fx.Node,
+    bias_scale: float,
+) -> fx.Node:
+    """
+    Creates a zero bias tensor with the shape of weight[0]
+    """
+    attr_node = getattr(graph_module, weight_node.target)
+    weight_shape = list(attr_node.shape)
+    bias_shape = weight_shape[0]
+    return graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        ([bias_shape], 0.0),
+        {"dtype": torch.int32},
+    )
+
+
+def get_bias_qparams(
+    obs_or_fqs: List[ObserverOrFakeQuantize],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    act_scale, _ = obs_or_fqs[0].calculate_qparams()
+    weight_scale, _ = obs_or_fqs[1].calculate_qparams()
+    bias_scale = act_scale * weight_scale
+    bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int32)
+    return bias_scale, bias_zero_point
+
+
+def get_conv_args(arg, first_val: int) -> List[fx.Node]:
+    return arg if len(arg) == 2 else [first_val, arg[0]]

From 9db0a692893e5a2d31b033f2a5efa3b6797b2a51 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Mon, 13 May 2024 07:20:55 -0700
Subject: [PATCH 36/62] Simplify gen_oplist_copy_from_core file (#3549)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3549

Looks like we just need the yaml file. That means we can get rid of gen_supported_mobile_models and write_selected_mobile_ops and all their dependencies.

Reviewed By: lucylq

Differential Revision: D57122006

fbshipit-source-id: 07a0aafb686237cae29d774e552eddccce797136
---
 codegen/tools/gen_all_oplist.py            |   5 +-
 codegen/tools/gen_oplist_copy_from_core.py | 174 +--------------------
 2 files changed, 3 insertions(+), 176 deletions(-)

diff --git a/codegen/tools/gen_all_oplist.py b/codegen/tools/gen_all_oplist.py
index 6626803df74..ec02ff7ec3c 100644
--- a/codegen/tools/gen_all_oplist.py
+++ b/codegen/tools/gen_all_oplist.py
@@ -22,10 +22,7 @@ def main(argv: List[Any]) -> None:
     parser = argparse.ArgumentParser(description="Generate operator lists")
     parser.add_argument(
         "--output_dir",
-        help=(
-            "The directory to store the output yaml files (selected_mobile_ops.h, "
-            + "selected_kernel_dtypes.h, selected_operators.yaml)"
-        ),
+        help=("The directory to store the output yaml file (selected_operators.yaml)"),
         required=True,
     )
     parser.add_argument(
diff --git a/codegen/tools/gen_oplist_copy_from_core.py b/codegen/tools/gen_oplist_copy_from_core.py
index 452f84ba572..34a8af245bb 100644
--- a/codegen/tools/gen_oplist_copy_from_core.py
+++ b/codegen/tools/gen_oplist_copy_from_core.py
@@ -5,138 +5,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# This is a copy from //xplat/caffe2/tools/code_analyzer/gen_oplist.py
-# TODO(mnachin): We will need to either simplify or remove this code altogether.
-# This is necessary to remove dependency from pytorch core from ExecuTorch.
+# This is a simplified copy from //xplat/caffe2/tools/code_analyzer/gen_oplist.py
 import argparse
-import json
 import os
 import sys
 from functools import reduce
-from typing import Any, List, Set
+from typing import Any, List
 
 import yaml
-from torchgen.code_template import CodeTemplate
 from torchgen.selective_build.selector import (
     combine_selective_builders,
     SelectiveBuilder,
 )
 
-if_condition_template_str = """if (kernel_tag_sv.compare("$kernel_tag_name") == 0) {
-  return $dtype_checks;
-}"""
-if_condition_template = CodeTemplate(if_condition_template_str)
-
-selected_kernel_dtypes_h_template_str = """
-#include <c10/core/ScalarType.h>
-#include <c10/util/string_view.h>
-#include <c10/macros/Macros.h>
-
-namespace at {
-inline constexpr bool should_include_kernel_dtype(
-  const char *kernel_tag_str,
-  at::ScalarType scalar_type
-) {
-  c10::string_view kernel_tag_sv C10_UNUSED = c10::string_view(kernel_tag_str);
-  $body
-  return false;
-}
-}
-"""
-selected_kernel_dtypes_h_template = CodeTemplate(selected_kernel_dtypes_h_template_str)
-
-selected_mobile_ops_preamble = """#pragma once
-/**
- * Generated by gen_selected_mobile_ops_header.py
- */
-
-"""
-
-
-def get_selected_kernel_dtypes_code(
-    selective_builder: SelectiveBuilder,
-) -> str:
-    # See https://www.internalfb.com/intern/paste/P153411698/ for an example of the
-    # generated code in case all kernel dtypes are selected and in case some kernel
-    # dtypes are selected (i.e. both cases).
-    #
-    body = "return true;"
-    if (
-        selective_builder.include_all_operators is False
-        and selective_builder.include_all_non_op_selectives is False
-    ):
-        body_parts = []
-        for kernel_tag, dtypes in selective_builder.kernel_metadata.items():
-            conditions = ["scalar_type == at::ScalarType::" + x for x in dtypes]
-            body_parts.append(
-                if_condition_template.substitute(
-                    kernel_tag_name=kernel_tag,
-                    dtype_checks=" || ".join(conditions),
-                ),
-            )
-        body = " else ".join(body_parts)
-
-    header_contents = selected_kernel_dtypes_h_template.substitute(body=body)
-    return header_contents
-
-
-def extract_root_operators(selective_builder: SelectiveBuilder) -> Set[str]:
-    ops = []
-    for op_name, op in selective_builder.operators.items():
-        if op.is_root_operator:
-            ops.append(op_name)
-    return set(ops)
-
-
-# Write the file selected_mobile_ops.h with optionally:
-# 1. The selected root operators
-# 2. The selected kernel dtypes
-def write_selected_mobile_ops(
-    output_file_path: str,
-    selective_builder: SelectiveBuilder,
-) -> None:
-    root_ops = extract_root_operators(selective_builder)
-    custom_classes = selective_builder.custom_classes
-    build_features = selective_builder.build_features
-    with open(output_file_path, "wb") as out_file:
-        body_parts = [selected_mobile_ops_preamble]
-        # This condition checks if we are in selective build.
-        # if these lists are not defined the corresponding selective build macros trivially return the item in question was selected
-        if not selective_builder.include_all_operators:
-            body_parts.append(
-                "#define TORCH_OPERATOR_WHITELIST "
-                + (";".join(sorted(root_ops)))
-                + ";\n\n"
-            )
-            # This condition checks if we are in tracing based selective build
-            if selective_builder.include_all_non_op_selectives is False:
-                body_parts.append(
-                    "#define TORCH_CUSTOM_CLASS_ALLOWLIST "
-                    + (";".join(sorted(custom_classes)))
-                    + ";\n\n"
-                )
-                body_parts.append(
-                    "#define TORCH_BUILD_FEATURE_ALLOWLIST "
-                    + (";".join(sorted(build_features)))
-                    + ";\n\n"
-                )
-
-        body_parts.append(get_selected_kernel_dtypes_code(selective_builder))
-        header_contents = "".join(body_parts)
-        out_file.write(header_contents.encode("utf-8"))
-
-
-def extract_all_operators(selective_builder: SelectiveBuilder) -> Set[str]:
-    return set(selective_builder.operators.keys())
-
-
-def extract_training_operators(selective_builder: SelectiveBuilder) -> Set[str]:
-    ops = []
-    for op_name, op in selective_builder.operators.items():
-        if op.is_used_for_training:
-            ops.append(op_name)
-    return set(ops)
-
 
 def throw_if_any_op_includes_overloads(selective_builder: SelectiveBuilder) -> None:
     ops = []
@@ -153,49 +34,6 @@ def throw_if_any_op_includes_overloads(selective_builder: SelectiveBuilder) -> N
         )
 
 
-def gen_supported_mobile_models(model_dicts: List[Any], output_dir: str) -> None:
-    supported_mobile_models_source = """/*
- * Generated by gen_oplist.py
- */
-#include "fb/supported_mobile_models/SupportedMobileModels.h"
-
-
-struct SupportedMobileModelCheckerRegistry {{
-  SupportedMobileModelCheckerRegistry() {{
-    auto& ref = facebook::pytorch::supported_model::SupportedMobileModelChecker::singleton();
-    ref.set_supported_md5_hashes(std::unordered_set<std::string>{{
-      {supported_hashes_template}
-    }});
-  }}
-}};
-
-// This is a global object, initializing which causes the registration to happen.
-SupportedMobileModelCheckerRegistry register_model_versions;
-
-
-"""
-
-    # Generate SupportedMobileModelsRegistration.cpp
-    md5_hashes = set()
-    for model_dict in model_dicts:
-        if "debug_info" in model_dict:
-            debug_info = json.loads(model_dict["debug_info"][0])
-            if debug_info["is_new_style_rule"]:
-                for asset_info in debug_info["asset_info"].values():
-                    md5_hashes.update(asset_info["md5_hash"])
-
-    supported_hashes = ""
-    for md5 in md5_hashes:
-        supported_hashes += f'"{md5}",\n'
-    with open(
-        os.path.join(output_dir, "SupportedMobileModelsRegistration.cpp"), "wb"
-    ) as out_file:
-        source = supported_mobile_models_source.format(
-            supported_hashes_template=supported_hashes
-        )
-        out_file.write(source.encode("utf-8"))
-
-
 def main(argv: List[Any]) -> None:
     """This binary generates 3 files:
 
@@ -258,9 +96,6 @@ def main(argv: List[Any]) -> None:
 
     selective_builders = [SelectiveBuilder.from_yaml_dict(m) for m in model_dicts]
 
-    # While we have the model_dicts generate the supported mobile models api
-    gen_supported_mobile_models(model_dicts, options.output_dir)
-
     # We may have 0 selective builders since there may not be any viable
     # pt_operator_library rule marked as a dep for the pt_operator_registry rule.
     # This is potentially an error, and we should probably raise an assertion
@@ -283,11 +118,6 @@ def main(argv: List[Any]) -> None:
             ).encode("utf-8"),
         )
 
-    write_selected_mobile_ops(
-        os.path.join(options.output_dir, "selected_mobile_ops.h"),
-        selective_builder,
-    )
-
 
 if __name__ == "__main__":
     main(sys.argv[1:])

From c229e7b2c4ca9676fbb76b226004485f998e2a54 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Mon, 13 May 2024 10:47:40 -0700
Subject: [PATCH 37/62] Update README.md with correct link (#3591)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/3591

Reviewed By: lucylq

Differential Revision: D57277638

Pulled By: mergennachin

fbshipit-source-id: 0824829f5da74a30b08da635703130f8b997042a
---
 examples/models/llama2/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md
index 74b6e8e1343..8fc77dd72ba 100644
--- a/examples/models/llama2/README.md
+++ b/examples/models/llama2/README.md
@@ -292,7 +292,7 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 
 ## Optional: Smaller models delegated to other backends
 Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction
-for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is
+for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is
 
 - Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --coreml -c stories110M.pt -p params.json`
 - MPS: `python -m examples.models.llama2.export_llama -kv --mps -c stories110M.pt -p params.json`

From a88a78d39cb1bc88a92f4d40075d4e5324448a39 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 13 May 2024 11:02:54 -0700
Subject: [PATCH 38/62] Separate mm and addmm into separate implementation
 files + cleanup (#3592)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3592

* Separate `mm` and `addmm` shaders and C++ implementations to improve readability. This is to prepare for the introduction of `aten.linear` which will add a further degree of configurability to these ops.
* Introduce `packed_dim_meta_ubo()` to store metadata about the packed dimension of a tensor. This simplifies the C++ code for dispatching the optimized shaders and also fixes dynamic shape for the optimized shader. The problem with the previous approach of passing in `batches, step_size, remainder` is that these values would not be updated upon a tensor reshape.
* Introduce various `property_of(ValueRef idx)` utility functions to `ComputeGraph`. This means we no longer have to declare `vTensorPtr` variables like the one below, which can be confusing (cc: yipjustin, copyrightly, jorgep31415)

```
vTensorPtr t_mat1 = graph.get_tensor(mat1);

// now we can just do
auto mat1_sizes = graph.sizes_of(mat1);
auto sizes_ubo = graph.sizes_ubo(mat1);
```

* Rename some loop variables in optimized matmul shader to improve readability

Reviewed By: yipjustin, liuk22

Differential Revision: D57203867

fbshipit-source-id: 159963fbe050a076696eafac35f8fdde8539ca23
---
 backends/vulkan/runtime/api/Tensor.cpp        |  28 ++
 backends/vulkan/runtime/api/Tensor.h          |  17 +
 backends/vulkan/runtime/api/Utils.h           |  11 +
 .../vulkan/runtime/graph/ComputeGraph.cpp     |  75 ++--
 backends/vulkan/runtime/graph/ComputeGraph.h  |  61 ++-
 .../vulkan/runtime/graph/containers/Value.h   |   8 +
 .../runtime/graph/ops/glsl/addmm_naive.glsl   |  80 ++++
 .../runtime/graph/ops/glsl/addmm_naive.yaml   |  20 +
 .../graph/ops/glsl/addmm_optimized.glsl       |  86 ++++
 .../graph/ops/glsl/addmm_optimized.yaml       |  17 +
 .../vulkan/runtime/graph/ops/glsl/matmul.h    |  80 ++--
 .../runtime/graph/ops/glsl/matmul_naive.glsl  |  71 ++--
 .../runtime/graph/ops/glsl/matmul_naive.yaml  |   6 -
 .../graph/ops/glsl/matmul_optimized.glsl      |  73 +---
 .../graph/ops/glsl/matmul_optimized.yaml      |   3 -
 .../runtime/graph/ops/impl/Convolution.cpp    |  15 +-
 .../vulkan/runtime/graph/ops/impl/Linear.cpp  | 211 ++++++++++
 .../vulkan/runtime/graph/ops/impl/MatMul.cpp  | 389 ++++--------------
 .../graph/ops/utils/ShaderNameUtils.cpp       |  24 +-
 .../runtime/graph/ops/utils/ShaderNameUtils.h |   5 +
 20 files changed, 759 insertions(+), 521 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Linear.cpp

diff --git a/backends/vulkan/runtime/api/Tensor.cpp b/backends/vulkan/runtime/api/Tensor.cpp
index cb4e0848191..4148601ee78 100644
--- a/backends/vulkan/runtime/api/Tensor.cpp
+++ b/backends/vulkan/runtime/api/Tensor.cpp
@@ -143,6 +143,7 @@ vTensor::vTensor(
       // Utility Uniform Buffers that can be passed to shaders as arguments
       sizes_uniform_(),
       texture_limits_uniform_(),
+      packed_dim_meta_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -212,6 +213,30 @@ const api::BufferBindInfo vTensor::texture_limits_ubo() {
   return api::BufferBindInfo(texture_limits_uniform_.buffer());
 }
 
+vTensor::PackedDimMeta vTensor::make_packed_dim_metadata() const {
+  int64_t packed_dim = gpu_memory_layout_int();
+  int32_t dim_size = api::utils::val_at(-(packed_dim + 1), sizes_);
+  int32_t dim_size_padded = api::utils::val_at(-(packed_dim + 1), gpu_sizes_);
+  int32_t dim_texel_len =
+      api::utils::safe_downcast<int32_t>(extents().data[packed_dim]);
+  int32_t padding = dim_size_padded - dim_size;
+
+  return {
+      dim_size,
+      dim_size_padded,
+      dim_texel_len,
+      padding,
+  };
+}
+
+const api::BufferBindInfo vTensor::packed_dim_meta_ubo() {
+  if (!packed_dim_meta_.buffer()) {
+    packed_dim_meta_ =
+        api::UniformParamsBuffer(storage_.context_, make_packed_dim_metadata());
+  }
+  return api::BufferBindInfo(packed_dim_meta_.buffer());
+}
+
 VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
   switch (storage_type()) {
     case api::kBuffer:
@@ -268,6 +293,9 @@ void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   if (texture_limits_uniform_.buffer()) {
     texture_limits_uniform_.update(texture_limits_);
   }
+  if (packed_dim_meta_.buffer()) {
+    packed_dim_meta_.update(make_packed_dim_metadata());
+  }
 }
 
 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
index 0ddd5d9a4f1..cb0fad76eb6 100644
--- a/backends/vulkan/runtime/api/Tensor.h
+++ b/backends/vulkan/runtime/api/Tensor.h
@@ -117,6 +117,13 @@ class vTensor final {
   vTensor& operator=(vTensor&& other) = default;
 
  private:
+  struct PackedDimMeta {
+    int32_t dim_size;
+    int32_t dim_size_padded;
+    int32_t dim_texel_len;
+    int32_t padding;
+  };
+
   api::ScalarType dtype_;
   api::GPUMemoryLayout memory_layout_;
 
@@ -134,6 +141,10 @@ class vTensor final {
   // tensor has been resized with `virtual_resize()`.
   api::UniformParamsBuffer texture_limits_uniform_;
 
+  // A Vulkan uniform buffer containing an instance of PackedDimMeta which
+  // describes how the tensor's packed dimension is padded.
+  api::UniformParamsBuffer packed_dim_meta_;
+
   vTensorStorage storage_;
 
  public:
@@ -220,6 +231,12 @@ class vTensor final {
    */
   const api::BufferBindInfo texture_limits_ubo();
 
+ private:
+  vTensor::PackedDimMeta make_packed_dim_metadata() const;
+
+ public:
+  const api::BufferBindInfo packed_dim_meta_ubo();
+
   inline const api::utils::ivec3 texture_limits() const {
     return texture_limits_.limits;
   }
diff --git a/backends/vulkan/runtime/api/Utils.h b/backends/vulkan/runtime/api/Utils.h
index ca36f7f75c6..26fe4ac075e 100644
--- a/backends/vulkan/runtime/api/Utils.h
+++ b/backends/vulkan/runtime/api/Utils.h
@@ -279,6 +279,17 @@ inline std::ostream& operator<<(std::ostream& os, const ivec4& v) {
   return os;
 }
 
+template <typename T, uint32_t N>
+inline detail::vec<T, N> divup_vec(
+    const detail::vec<T, N>& a,
+    const detail::vec<T, N>& b) {
+  detail::vec<T, N> result;
+  for (uint32_t i = 0; i < N; ++i) {
+    result.data[i] = api::utils::div_up(a.data[i], b.data[i]);
+  }
+  return result;
+}
+
 //
 // std::vector<T> Handling
 //
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index cc91e6b2dc6..aa34aae6771 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -89,33 +89,6 @@ ComputeGraph::~ComputeGraph() {
   context_->flush();
 }
 
-void ComputeGraph::update_descriptor_counts(
-    const api::ShaderInfo& shader_info,
-    bool execute) {
-  api::DescriptorPoolConfig* config =
-      execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_;
-
-  config->descriptor_pool_max_sets += 1;
-  for (const VkDescriptorType arg_type : shader_info.kernel_layout) {
-    switch (arg_type) {
-      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-        config->descriptor_uniform_buffer_count += 1;
-        break;
-      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-        config->descriptor_storage_buffer_count += 1;
-        break;
-      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-        config->descriptor_combined_sampler_count += 1;
-        break;
-      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-        config->descriptor_storage_image_count += 1;
-        break;
-      default:
-        VK_THROW("Unsupported descriptor type!");
-    }
-  }
-}
-
 api::StorageType ComputeGraph::suggested_storage_type() {
   if (config_.enable_storage_type_override) {
     return config_.storage_type_override;
@@ -148,22 +121,22 @@ void ComputeGraph::check_no_active_value_ptrs() {
       "invalidated.");
 }
 
-std::vector<int64_t> ComputeGraph::get_sizes_of(ValueRef idx) {
-  Value& val = values_.at(idx);
+std::vector<int64_t> ComputeGraph::sizes_of(const ValueRef idx) const {
+  const Value& val = values_.at(idx);
   if (val.isTensor()) {
-    return val.toTensor().sizes();
+    return val.toConstTensor().sizes();
   } else if (val.isTensorRef()) {
-    return val.toTensorRef().sizes;
+    return val.toConstTensorRef().sizes;
   }
   VK_THROW("Could not get sizes of value with type ", val.type());
 }
 
-api::ScalarType ComputeGraph::get_dtype_of(ValueRef idx) {
-  Value& val = values_.at(idx);
+api::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const {
+  const Value& val = values_.at(idx);
   if (val.isTensor()) {
-    return val.toTensor().dtype();
+    return val.toConstTensor().dtype();
   } else if (val.isTensorRef()) {
-    return val.toTensorRef().dtype;
+    return val.toConstTensorRef().dtype;
   }
   VK_THROW("Could not get dtype of value with type ", val.type());
 }
@@ -200,14 +173,13 @@ ValueRef ComputeGraph::add_tensor_like(
     const ValueRef idx,
     const api::StorageType storage_type,
     const api::GPUMemoryLayout memory_layout) {
-  return add_tensor(
-      get_sizes_of(idx), get_dtype_of(idx), storage_type, memory_layout);
+  return add_tensor(sizes_of(idx), dtype_of(idx), storage_type, memory_layout);
 }
 
 ValueRef ComputeGraph::add_tensor_like(
     const ValueRef idx,
     const api::GPUMemoryLayout memory_layout) {
-  return add_tensor(get_sizes_of(idx), get_dtype_of(idx), memory_layout);
+  return add_tensor(sizes_of(idx), dtype_of(idx), memory_layout);
 }
 
 ValueRef ComputeGraph::add_tensor(
@@ -300,6 +272,33 @@ SharedObject& ComputeGraph::get_shared_object(const int64_t idx) {
   return shared_objects_.at(idx);
 }
 
+void ComputeGraph::update_descriptor_counts(
+    const api::ShaderInfo& shader_info,
+    bool execute) {
+  api::DescriptorPoolConfig* config =
+      execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_;
+
+  config->descriptor_pool_max_sets += 1;
+  for (const VkDescriptorType arg_type : shader_info.kernel_layout) {
+    switch (arg_type) {
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+        config->descriptor_uniform_buffer_count += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+        config->descriptor_storage_buffer_count += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+        config->descriptor_combined_sampler_count += 1;
+        break;
+      case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+        config->descriptor_storage_image_count += 1;
+        break;
+      default:
+        VK_THROW("Unsupported descriptor type!");
+    }
+  }
+}
+
 void ComputeGraph::copy_into_staging(
     const ValueRef idx,
     const void* data,
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index fbb49f47998..6a7c9e3f424 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -123,9 +123,17 @@ class ComputeGraph final {
     return outputs_;
   }
 
-  void update_descriptor_counts(
-      const api::ShaderInfo& shader_info,
-      bool execute);
+  inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
+    return prepack_nodes_;
+  }
+
+  inline std::vector<std::unique_ptr<ExecuteNode>>& execute_nodes() {
+    return execute_nodes_;
+  }
+
+  //
+  // Value Extraction
+  //
 
 #define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \
   inline ptr_type get_##short_name(const ValueRef idx) {                   \
@@ -168,9 +176,33 @@ class ComputeGraph final {
     return values_.at(idx).type();
   }
 
-  std::vector<int64_t> get_sizes_of(ValueRef idx);
+  // Get Tensor Property
+
+  std::vector<int64_t> sizes_of(const ValueRef idx) const;
+
+  api::ScalarType dtype_of(const ValueRef idx) const;
+
+  inline api::utils::uvec3 extents_of(const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().extents();
+  }
+
+  inline api::GPUMemoryLayout memory_layout_of(const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().gpu_memory_layout();
+  }
 
-  api::ScalarType get_dtype_of(ValueRef idx);
+  inline api::BufferBindInfo sizes_ubo(const ValueRef idx) {
+    return values_.at(idx).toTensor().sizes_ubo();
+  }
+
+  inline api::BufferBindInfo texture_limits_ubo(const ValueRef idx) {
+    return values_.at(idx).toTensor().texture_limits_ubo();
+  }
+
+  inline api::BufferBindInfo packed_dim_meta_ubo(const ValueRef idx) {
+    return values_.at(idx).toTensor().packed_dim_meta_ubo();
+  }
+
+  // Scalar Value Extraction
 
   template <typename T>
   T extract_scalar(const ValueRef idx) {
@@ -200,14 +232,6 @@ class ComputeGraph final {
     return values_.at(idx).toString();
   }
 
-  inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
-    return prepack_nodes_;
-  }
-
-  inline std::vector<std::unique_ptr<ExecuteNode>>& execute_nodes() {
-    return execute_nodes_;
-  }
-
   //
   // Utility functions
   //
@@ -233,13 +257,6 @@ class ComputeGraph final {
   api::GPUMemoryLayout suggested_memory_layout(
       const std::vector<int64_t>& sizes);
 
-  /*
-   * Returns the memory layout of a Tensor value at the specified index.
-   */
-  inline api::GPUMemoryLayout memory_layout_of(ValueRef idx) {
-    return get_tensor(idx)->gpu_memory_layout();
-  }
-
   //
   // Graph Building
   //
@@ -367,6 +384,10 @@ class ComputeGraph final {
   // Graph Preparation
   //
 
+  void update_descriptor_counts(
+      const api::ShaderInfo& shader_info,
+      bool execute);
+
   void prepare();
 
   //
diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h
index 2e5da86a723..2b3da29dde9 100644
--- a/backends/vulkan/runtime/graph/containers/Value.h
+++ b/backends/vulkan/runtime/graph/containers/Value.h
@@ -230,6 +230,14 @@ struct Value final {
         tag,                                                \
         " instead.");                                       \
     return payload.member_name;                             \
+  }                                                         \
+  inline const type& toConst##type_name() const {           \
+    VK_CHECK_COND(                                          \
+        is##type_name(),                                    \
+        "Expected value to have type " #type_name ", got ", \
+        tag,                                                \
+        " instead.");                                       \
+    return payload.member_name;                             \
   }
 
   SUPPORT_TRIVIALLY_MOVEABLE_TYPE(vTensor, Tensor, TypeTag::TENSOR, as_tensor);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
new file mode 100644
index 00000000000..abdbe24d223
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+#include "matmul.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
+layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
+layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
+layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self;
+
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
+  ivec4 in_sizes;
+};
+
+layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes {
+  ivec3 self_sizes;
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict AddmmParams {
+  float alpha;
+  float beta;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  vec4 texel = vec4(0);
+  ivec3 mat1_pos = ivec3(0, pos.y, pos.z);
+
+  $if MAT1_PACKING == "W_packed":
+    $if MAT2_PACKING == "H_packed":
+      ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z);
+      texel = matmul_naive_W_packed_H_packed(
+          im_mat1,
+          im_mat2,
+          mat1_pos,
+          mat2_pos,
+          in_sizes[0]);
+    $elif MAT2_PACKING == "W_packed":
+      ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);
+      texel = matmul_naive_W_packed_W_packed(
+          im_mat1,
+          im_mat2,
+          mat1_pos,
+          mat2_pos,
+          in_sizes[0]);
+    $else:
+      $raise Exception("Unsupported value for MAT2_PACKING")
+  $else:
+    $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING")
+
+  vec4 self_texel = get_texel_W_packed(
+      im_self,
+      pos,
+      self_sizes.x == 1,
+      self_sizes.y == 1);
+
+  texel = beta * self_texel + alpha * texel;
+  imageStore(im_out, pos, texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
new file mode 100644
index 00000000000..6861b312d5f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+addmm_naive:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+    MAT1_PACKING: W_packed
+    MAT2_PACKING: H_packed
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: addmm_naive_W_packed_H_packed
+    - NAME: addmm_naive_W_packed_W_packed
+      MAT2_PACKING: W_packed
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
new file mode 100644
index 00000000000..2830a34290f
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+#include "matmul.h"
+
+// addmm will have additional arguments compared to regular mm
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
+layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
+layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
+layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self;
+
+layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes {
+  ivec4 out_sizes;
+};
+
+layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes {
+  ivec4 self_sizes;
+};
+
+layout(set = 0, binding = 7) uniform PRECISION restrict PackedDimMeta {
+  int packed_dim_size;
+  int packed_dim_size_padded;
+  int packed_dim_texel_len;
+  int packed_dim_padding;
+};
+
+layout(set = 0, binding = 8) uniform PRECISION restrict Params {
+  float alpha;
+  float beta;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  FloatMatrix results = matmul_partial_4x4(
+      im_mat1,
+      im_mat2,
+      pos,
+      out_sizes[2],
+      packed_dim_texel_len,
+      packed_dim_padding);
+
+  for (int idx_c = 0; idx_c < FOUR; idx_c++) {
+    for (int idx_r = 0; idx_r < FOUR; idx_r++) {
+      const ivec3 out_pos =
+          ivec3(idx_r + FOUR * pos.x, idx_c + FOUR * pos.y, pos.z);
+
+      vec4 self_texel = get_texel_C_packed(
+          im_self,
+          out_pos,
+          self_sizes.x == 1,
+          self_sizes.y == 1);
+      results.data[idx_c][idx_r][0] = beta * self_texel.x + alpha * results.data[idx_c][idx_r][0];
+
+      // results is in transposed order w.r.t. the desired output
+      imageStore(
+          im_out,
+          out_pos,
+          vec4(
+              results.data[idx_c][idx_r][0],
+              results.data[idx_c][idx_r][1],
+              results.data[idx_c][idx_r][2],
+              results.data[idx_c][idx_r][3]));
+    }
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
new file mode 100644
index 00000000000..53352342a84
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+addmm_optimized:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+    PACKING: C_packed
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: addmm_optimized
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h
index f157828f616..ec00a53a649 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.h
@@ -21,7 +21,7 @@ vec4 matmul_naive_W_packed_H_packed(
     sampler3D im_mat2,
     ivec3 mat1_pos,
     ivec3 mat2_pos,
-    int width) {
+    const int width) {
   vec4 texel = vec4(0);
   int K = (width + 3) / 4;
 
@@ -47,7 +47,7 @@ vec4 matmul_naive_W_packed_W_packed(
     sampler3D im_mat2,
     ivec3 mat1_pos,
     ivec3 mat2_pos,
-    int width) {
+    const int width) {
   vec4 texel = vec4(0);
   int K = divup4(width);
 
@@ -71,20 +71,20 @@ vec4 matmul_naive_W_packed_W_packed(
 // get texel from self tensor (width_packed) in addmm
 vec4 get_texel_W_packed(
     sampler3D im_self,
-    ivec3 pos,
-    int broadcast_at_width,
-    int broadcast_at_height) {
+    const ivec3 pos,
+    const bool broadcast_at_width,
+    const bool broadcast_at_height) {
   vec4 self_texel;
   // self is of shape {1}
-  if (broadcast_at_width == 1 && broadcast_at_height == 1) {
+  if (broadcast_at_width && broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0).xxxx;
   }
   // self is of shape {*, 1}
-  else if (broadcast_at_width == 1) {
+  else if (broadcast_at_width) {
     self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0).xxxx;
   }
   // self is of shape {1, *}
-  else if (broadcast_at_height == 1) {
+  else if (broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0);
   } else {
     self_texel = texelFetch(im_self, pos, 0);
@@ -96,20 +96,20 @@ vec4 get_texel_W_packed(
 // get texel from self tensor (channel_packed) in addmm
 vec4 get_texel_C_packed(
     sampler3D im_self,
-    ivec3 pos,
-    int broadcast_at_width,
-    int broadcast_at_height) {
+    const ivec3 pos,
+    const bool broadcast_at_width,
+    const bool broadcast_at_height) {
   vec4 self_texel;
   // self is of shape {1}
-  if (broadcast_at_width == 1 && broadcast_at_height == 1) {
+  if (broadcast_at_width && broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0);
   }
   // self is of shape {*, 1}
-  else if (broadcast_at_width == 1) {
+  else if (broadcast_at_width) {
     self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0);
   }
   // self is of shape {1, *}
-  else if (broadcast_at_height == 1) {
+  else if (broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0);
   } else {
     self_texel = texelFetch(im_self, pos, 0);
@@ -121,10 +121,10 @@ vec4 get_texel_C_packed(
 FloatMatrix matmul_partial_4x4(
     sampler3D im_mat1,
     sampler3D im_mat2,
-    ivec3 pos,
-    int batch_size,
-    int step_size,
-    int reminder) {
+    const ivec3 pos,
+    const int batch_size,
+    const int K_texel_len,
+    const int packed_dim_padding) {
   FloatMatrix results;
   for (int i = 0; i < FOUR; i++) {
     for (int j = 0; j < FOUR; j++) {
@@ -133,43 +133,43 @@ FloatMatrix matmul_partial_4x4(
       }
     }
   }
-  // read and cache 4x4 tile of im_mat1 (4 adjacent rows)
   vec4 im_mat1_partial_rows[FOUR];
   vec4 im_mat2_partial_cols[FOUR];
 
-  for (int c = 0; c < FOUR; c++) {
-    if (FOUR * pos.z + c >= batch_size) {
+  for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) {
+    if (FOUR * pos.z + batch_idx >= batch_size) {
       break;
     }
-    for (int j = 0; j < step_size; j++) {
-      for (int k = 0; k < FOUR; k++) {
-        const int pos_y_offset = (FOUR * pos.y) + k;
-        const ivec3 pos_rd = ivec3(j, pos_y_offset, FOUR * pos.z + c);
-        im_mat1_partial_rows[k] = texelFetch(im_mat1, pos_rd, 0);
+    // read and cache 4x4 tile of im_mat1 (4 adjacent rows)
+    for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) {
+      for (int mat1_row = 0; mat1_row < FOUR; mat1_row++) {
+        const int mat1_y = (FOUR * pos.y) + mat1_row;
+        const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, FOUR * pos.z + batch_idx);
+        im_mat1_partial_rows[mat1_row] = texelFetch(im_mat1, mat1_pos, 0);
         // set the value out of the boundary to be 0
-        if (j == step_size - 1 && reminder > 0) {
-          for (int kk = 0; kk < 4 - reminder; kk++) {
-            im_mat1_partial_rows[k][3 - kk] = 0;
+        if (mat1_x == K_texel_len - 1 && packed_dim_padding > 0) {
+          for (int kk = 0; kk < packed_dim_padding; kk++) {
+            im_mat1_partial_rows[mat1_row][3 - kk] = 0;
           }
         }
       }
       // read and cache 4x4 tile of im_mat2 (4 adjacent columns)
-      for (int k = 0; k < FOUR; k++) {
-        const int pos_x_offset = (FOUR * pos.x) + k;
-        const ivec3 pos_rd = ivec3(pos_x_offset, j, FOUR * pos.z + c);
-        im_mat2_partial_cols[k] = texelFetch(im_mat2, pos_rd, 0);
+      for (int mat2_col = 0; mat2_col < FOUR; mat2_col++) {
+        const int mat2_x = (FOUR * pos.x) + mat2_col;
+        const ivec3 pos_rd = ivec3(mat2_x, mat1_x, FOUR * pos.z + batch_idx);
+        im_mat2_partial_cols[mat2_col] = texelFetch(im_mat2, pos_rd, 0);
         // set the value out of the boundary to be 0
-        if (j == step_size - 1 && reminder > 0) {
-          for (int kk = 0; kk < 4 - reminder; kk++) {
-            im_mat2_partial_cols[k][3 - kk] = 0;
+        if (mat1_x == K_texel_len - 1 && packed_dim_padding > 0) {
+          for (int kk = 0; kk < packed_dim_padding; kk++) {
+            im_mat2_partial_cols[mat2_col][3 - kk] = 0;
           }
         }
       }
       // perform partial dot products and add partial result to results
-      for (int idx_r = 0; idx_r < FOUR; idx_r++) {
-        for (int idx_c = 0; idx_c < FOUR; idx_c++) {
-          results.data[idx_r][idx_c][c] +=
-              dot(im_mat1_partial_rows[idx_r], im_mat2_partial_cols[idx_c]);
+      for (int out_row = 0; out_row < FOUR; out_row++) {
+        for (int out_col = 0; out_col < FOUR; out_col++) {
+          results.data[out_row][out_col][batch_idx] +=
+              dot(im_mat1_partial_rows[out_row], im_mat2_partial_cols[out_col]);
         }
       }
     }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
index cb8371cb5df..d7e4395d04f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
@@ -13,40 +13,17 @@
 #include "indexing_utils.h"
 #include "matmul.h"
 
-$if IS_ADDMM:
-  // addmm will have additional arguments compared to regular mm
-  layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-  layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-  layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-  layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self;
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
+layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
+layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
 
-  layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-    ivec3 out_limits;
-  };
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
 
-  layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
-    ivec4 in_sizes;
-  };
-
-  layout(set = 0, binding = 6) uniform PRECISION restrict AddmmParams {
-    int broadcast_at_width;
-    int broadcast_at_height;
-    float alpha;
-    float beta;
-  };
-$else:
-  // define original matmul_naive arguments
-  layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-  layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-  layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-
-  layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
-    ivec3 out_limits;
-  };
-
-  layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
-    ivec4 in_sizes;
-  };
+layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
+  ivec4 in_sizes;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -61,20 +38,26 @@ void main() {
   ivec3 mat1_pos = ivec3(0, pos.y, pos.z);
 
   $if MAT1_PACKING == "W_packed":
-      $if MAT2_PACKING == "H_packed":
-        ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z);
-        texel = matmul_naive_W_packed_H_packed(im_mat1, im_mat2, mat1_pos, mat2_pos, in_sizes[0]);
-      $elif MAT2_PACKING == "W_packed":
-        ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);
-        texel = matmul_naive_W_packed_W_packed(im_mat1, im_mat2, mat1_pos, mat2_pos, in_sizes[0]);
-      $else:
-        $raise Exception("Unsupported value for MAT2_PACKING")
+    $if MAT2_PACKING == "H_packed":
+      ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z);
+      texel = matmul_naive_W_packed_H_packed(
+          im_mat1,
+          im_mat2,
+          mat1_pos,
+          mat2_pos,
+          in_sizes[0]);
+    $elif MAT2_PACKING == "W_packed":
+      ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);
+      texel = matmul_naive_W_packed_W_packed(
+          im_mat1,
+          im_mat2,
+          mat1_pos,
+          mat2_pos,
+          in_sizes[0]);
+    $else:
+      $raise Exception("Unsupported value for MAT2_PACKING")
   $else:
     $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING")
 
-  $if IS_ADDMM:
-    vec4 self_texel = get_texel_W_packed(im_self, pos, broadcast_at_width, broadcast_at_height);
-    texel = beta * self_texel + alpha * texel;
-
   imageStore(im_out, pos, texel);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
index 32cff0cf09e..727e8b361d8 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
@@ -10,17 +10,11 @@ matmul_naive:
     NDIM: 3
     MAT1_PACKING: W_packed
     MAT2_PACKING: H_packed
-    IS_ADDMM: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
     - NAME: matmul_naive_W_packed_H_packed
-    - NAME: addmm_naive_W_packed_H_packed
-      IS_ADDMM: true
     - NAME: matmul_naive_W_packed_W_packed
       MAT2_PACKING: W_packed
-    - NAME: addmm_naive_W_packed_W_packed
-      MAT2_PACKING: W_packed
-      IS_ADDMM: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
index b9f62cc6593..dd9c57416d2 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
@@ -13,56 +13,24 @@
 #include "indexing_utils.h"
 #include "matmul.h"
 
-$if IS_ADDMM:
-  // addmm will have additional arguments compared to regular mm
-  layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-  layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-  layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-  layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self;
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
+layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
+layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
 
-  layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-    ivec3 out_limits;
-  };
+layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
 
-  layout(set = 0, binding = 5) uniform PRECISION restrict StepSize {
-    int step_size;
-  };
+layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
+  ivec4 out_sizes;
+};
 
-  layout(set = 0, binding = 6) uniform PRECISION restrict Reminder {
-    int reminder;
-  };
-
-  layout(set = 0, binding = 7) uniform PRECISION restrict BatchSize {
-    int batch_size;
-  };
-
-  layout(set = 0, binding = 8) uniform PRECISION restrict AddmmParams {
-    int broadcast_at_width;
-    int broadcast_at_height;
-    float alpha;
-    float beta;
-  };
-$else:
-  // define original matmul_optimized arguments
-  layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-  layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-  layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-
-  layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
-    ivec3 out_limits;
-  };
-
-  layout(set = 0, binding = 4) uniform PRECISION restrict StepSize {
-    int step_size;
-  };
-
-  layout(set = 0, binding = 5) uniform PRECISION restrict Reminder {
-    int reminder;
-  };
-
-  layout(set = 0, binding = 6) uniform PRECISION restrict BatchSize {
-    int batch_size;
-  };
+layout(set = 0, binding = 5) uniform PRECISION restrict PackedDimMeta {
+  int packed_dim_size;
+  int packed_dim_size_padded;
+  int packed_dim_texel_len;
+  int packed_dim_padding;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -73,15 +41,18 @@ void main() {
     return;
   }
 
-  FloatMatrix results = matmul_partial_4x4(im_mat1, im_mat2, pos, batch_size, step_size, reminder);
+  FloatMatrix results = matmul_partial_4x4(
+      im_mat1,
+      im_mat2,
+      pos,
+      out_sizes[2],
+      packed_dim_texel_len,
+      packed_dim_padding);
 
   for (int idx_c = 0; idx_c < FOUR; idx_c++) {
     for (int idx_r = 0; idx_r < FOUR; idx_r++) {
       const ivec3 out_pos =
           ivec3(idx_r + FOUR * pos.x, idx_c + FOUR * pos.y, pos.z);
-      $if IS_ADDMM:
-        vec4 self_texel = get_texel_C_packed(im_self, out_pos, broadcast_at_width, broadcast_at_height);
-        results.data[idx_c][idx_r][0] = beta * self_texel.x + alpha * results.data[idx_c][idx_r][0];
 
       // results is in transposed order w.r.t. the desired output
       imageStore(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
index 250d2b1a5b9..7cec20e167c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
@@ -9,12 +9,9 @@ matmul_optimized:
     DTYPE: float
     NDIM: 3
     PACKING: C_packed
-    IS_ADDMM: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
     - NAME: matmul_optimized
-    - NAME: addmm_optimized
-      IS_ADDMM: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 4221f6f373e..d457f637d47 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -90,11 +90,11 @@ ValueRef prepack_biases(
     const bool transposed,
     const api::StorageType storage_type,
     const api::GPUMemoryLayout memory_layout) {
-  auto sizes = graph.get_sizes_of(weight);
+  auto sizes = graph.sizes_of(weight);
   const int64_t out_channels = transposed ? sizes.at(1) : sizes.at(0);
 
   ValueRef v = graph.add_tensor(
-      {out_channels}, graph.get_dtype_of(weight), storage_type, memory_layout);
+      {out_channels}, graph.dtype_of(weight), storage_type, memory_layout);
   vTensorPtr t = graph.get_tensor(v);
 
   api::ShaderInfo shader = get_nchw_to_image_shader(*t);
@@ -193,14 +193,11 @@ ValueRef prepack_weights(
     ComputeGraph& graph,
     const ValueRef vref,
     const Conv2dMethod method) {
-  const auto original_sizes = graph.get_sizes_of(vref);
+  const auto original_sizes = graph.sizes_of(vref);
   const auto final_sizes = get_final_sizes(original_sizes, method);
 
   ValueRef v = graph.add_tensor(
-      final_sizes,
-      graph.get_dtype_of(vref),
-      api::kTexture2D,
-      api::kChannelsPacked);
+      final_sizes, graph.dtype_of(vref), api::kTexture2D, api::kChannelsPacked);
   vTensorPtr t = graph.get_tensor(v);
 
   api::utils::uvec3 global_size = t->extents();
@@ -246,7 +243,7 @@ Conv2dParams create_conv2d_params(
       p.kernel_size.data[1] +
           (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1),
   });
-  const auto weight_sizes = graph.get_sizes_of(weight);
+  const auto weight_sizes = graph.sizes_of(weight);
   const int32_t in_group_size =
       api::utils::safe_downcast<int32_t>(api::utils::align_up(
           transposed ? weight_sizes.at(0) : weight_sizes.at(1), INT64_C(4)));
@@ -274,7 +271,7 @@ Conv2dMethod get_conv2d_method(
     const ValueRef weight,
     const int64_t groups,
     const bool transposed) {
-  const auto weight_sizes = graph.get_sizes_of(weight);
+  const auto weight_sizes = graph.sizes_of(weight);
   if (!transposed && weight_sizes.at(0) == groups && weight_sizes.at(1) == 1) {
     return Conv2dMethod::Depthwise;
   }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
new file mode 100644
index 00000000000..9e4ea7a9ba0
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void check_addmm_args(
+    ComputeGraph& graph,
+    const ValueRef self,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef beta,
+    const ValueRef alpha,
+    const ValueRef out) {
+  (void)alpha; // the scalar arguments are accepted but not validated here
+  (void)beta;
+  // Validates the sizes and layouts of the addmm operands via VK_CHECK_COND.
+  std::vector<int64_t> self_sizes = graph.sizes_of(self);
+  std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
+  std::vector<int64_t> mat2_sizes = graph.sizes_of(mat2_data);
+  // mat1 must be 2-D or 3-D, and mat2 must have the same number of dims.
+  VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3);
+  VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size());
+  // out must use the same memory layout as mat1.
+  VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out));
+  // Inner dims must match (val_at presumably indexes from the back; confirm).
+  VK_CHECK_COND(
+      api::utils::val_at(-1, mat1_sizes) == api::utils::val_at(-2, mat2_sizes));
+  // self is only compared against mat2/mat1 in dims where its size is not 1.
+  if (api::utils::val_at(-1, self_sizes) != 1) {
+    VK_CHECK_COND(
+        api::utils::val_at(-1, self_sizes) ==
+        api::utils::val_at(-1, mat2_sizes));
+  }
+  if (api::utils::val_at(-2, self_sizes) != 1) {
+    VK_CHECK_COND(
+        api::utils::val_at(-2, self_sizes) ==
+        api::utils::val_at(-2, mat1_sizes));
+  }
+}
+
+// Resize callback for addmm nodes: derives out's sizes from the current sizes
+// of mat1 and mat2. self (args[1].refs[2]) is not needed, so it is not fetched.
+void resize_addmm_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]);
+  vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]);
+
+  std::vector<int64_t> new_out_sizes(3);
+  if (mat1->sizes().size() == 2) { // 2-D: {mat1 dim 0, mat2 dim 1}
+    new_out_sizes.resize(2);
+    new_out_sizes.at(0) = mat1->sizes().at(0);
+    new_out_sizes.at(1) = mat2->sizes().at(1);
+  } else { // otherwise 3 sizes: dims 0 and 1 from mat1, dim 2 from mat2
+    new_out_sizes.at(0) = mat1->sizes().at(0);
+    new_out_sizes.at(1) = mat1->sizes().at(1);
+    new_out_sizes.at(2) = mat2->sizes().at(2);
+  }
+  out->virtual_resize(new_out_sizes);
+}
+
+struct Params final { // passed to create_params_buffer by the addmm builders
+  float alpha; // multiplies the mat1 x mat2 result in addmm_optimized.glsl
+  float beta; // multiplies the self texel in addmm_optimized.glsl
+}; // NOTE(review): field order presumably mirrors the shader block; confirm
+
+void add_addmm_naive_node(
+    ComputeGraph& graph,
+    const ValueRef self,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef beta, // not read here; the shader receives params instead
+    const ValueRef alpha, // not read here; the shader receives params instead
+    const ValueRef out,
+    const Params& params) {
+  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
+  // Records the naive addmm shader, dispatched over the extents of out.
+  api::utils::uvec3 global_size = graph.extents_of(out);
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+  // Shader name: "addmm_naive" + layout suffixes of mat1, mat2 + dtype of out.
+  std::string kernel_name("addmm_naive");
+  kernel_name.reserve(kShaderNameReserve);
+  add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1));
+  add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE},
+       {{mat1, mat2, self}, api::MemoryAccessType::READ}},
+      // Shader params buffers
+      {
+          graph.texture_limits_ubo(out),
+          graph.sizes_ubo(mat1),
+          graph.sizes_ubo(self),
+          graph.create_params_buffer(params), // carries alpha and beta
+      },
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_addmm_node));
+}
+
+// Records the tiled addmm shader. mat1 is copied to a width-packed tensor and
+// mat2 (if needed) to a height-packed one; each invocation covers 4x4 texels.
+void add_addmm_optimized_node(
+    ComputeGraph& graph,
+    const ValueRef self,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef beta,
+    const ValueRef alpha,
+    const ValueRef out,
+    const Params& params) {
+  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
+  // Ensure mat1 is width packed
+  ValueRef mat1_W_packed = graph.add_tensor_like(mat1, api::kWidthPacked);
+  auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
+  viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
+  // Ensure mat2 is height packed
+  ValueRef mat2_H_packed = mat2;
+  if (graph.memory_layout_of(mat2) != api::kHeightPacked) {
+    mat2_H_packed = graph.add_tensor_like(mat2, api::kHeightPacked);
+    viewFn(graph, {mat2, graph.add_none(), mat2_H_packed});
+  }
+
+  api::utils::uvec3 global_size =
+      api::utils::divup_vec(graph.extents_of(out), {4, 4, 1});
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  std::string kernel_name("addmm_optimized");
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE},
+       {{mat1_W_packed, mat2_H_packed, self}, api::MemoryAccessType::READ}},
+      // Shader params buffers
+      {
+          graph.texture_limits_ubo(out),
+          graph.sizes_ubo(out),
+          graph.sizes_ubo(self),
+          graph.packed_dim_meta_ubo(mat1_W_packed),
+          graph.create_params_buffer(params),
+      },
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_addmm_node));
+}
+
+// Entry point for building an addmm node: reads alpha and beta as floats and
+// dispatches on mat1's memory layout (channels packed -> optimized shader,
+// width packed -> naive shader); any other layout raises through VK_THROW.
+void add_addmm_node(
+    ComputeGraph& graph,
+    const ValueRef self,
+    const ValueRef mat1,
+    const ValueRef mat2,
+    const ValueRef beta,
+    const ValueRef alpha,
+    const ValueRef out) {
+  const float alpha_val = graph.extract_scalar<float>(alpha);
+  const float beta_val = graph.extract_scalar<float>(beta);
+  const Params params = {alpha_val, beta_val};
+
+  if (graph.memory_layout_of(mat1) == api::kChannelsPacked) {
+    add_addmm_optimized_node(graph, self, mat1, mat2, beta, alpha, out, params);
+  } else if (graph.memory_layout_of(mat1) == api::kWidthPacked) {
+    add_addmm_naive_node(graph, self, mat1, mat2, beta, alpha, out, params);
+  } else {
+    VK_THROW("Input should be channel packed or width packed.");
+  }
+}
+
+void addmm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  check_addmm_args(graph, args[0], args[1], args[2], args[3], args[4], args[5]);
+  return add_addmm_node( // args order: self, mat1, mat2, beta, alpha, out
+      graph, args[0], args[1], args[2], args[3], args[4], args[5]);
+} // NOTE(review): args.size() is not checked before indexing up to args[5]
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.addmm.default, addmm); // aten.addmm.default -> addmm()
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
index 45a74636ea5..063956ad315 100644
--- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -18,20 +18,20 @@
 namespace vkcompute {
 
 void check_matmul_args(
-    ComputeGraph& graph,
-    const ValueRef arg1,
-    const ValueRef arg2,
+    const ComputeGraph& graph,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
     const ValueRef out) {
-  vTensorPtr t_mat1 = graph.get_tensor(arg1);
-  vTensorPtr t_mat2 = graph.get_tensor(arg2);
-  vTensorPtr t_out = graph.get_tensor(out);
+  std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
+  std::vector<int64_t> mat2_sizes = graph.sizes_of(mat2_data);
 
-  VK_CHECK_COND(check_ndim_is(*t_mat1, 2) || check_ndim_is(*t_mat1, 3));
-  VK_CHECK_COND(check_same_ndim(*t_mat1, *t_mat2));
+  VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3);
+  VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size());
 
-  VK_CHECK_COND(check_same_memory_layout(*t_mat1, *t_out));
+  VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out));
 
-  VK_CHECK_COND(check_same_sizes_at(*t_mat1, -1, *t_mat2, -2));
+  VK_CHECK_COND(
+      api::utils::val_at(-1, mat1_sizes) == api::utils::val_at(-2, mat2_sizes));
 }
 
 void resize_matmul_node(
@@ -57,324 +57,107 @@ void resize_matmul_node(
   out->virtual_resize(new_out_sizes);
 }
 
-struct AddmmParams final {
-  int broadcast_at_width;
-  int broadcast_at_height;
-  float alpha;
-  float beta;
-};
-
-// TODO: `add_matmul_node` and `add_addmm_node` has lots of duplicated code.
-// We should do refactoring to simplify.
-void add_matmul_node(
+void add_matmul_naive_node(
     ComputeGraph& graph,
     const ValueRef mat1,
-    const ValueRef mat2,
+    const ValueRef mat2_data,
     const ValueRef out) {
-  ValueRef arg1 = mat1;
-  ValueRef arg2 = prepack_if_tensor_ref(graph, mat2, api::kHeightPacked);
-
-  std::vector<int64_t> t_mat1_sizes = graph.get_tensor(arg1)->sizes();
-  std::vector<int64_t> t_mat2_sizes = graph.get_tensor(arg2)->sizes();
-  std::vector<int64_t> out_sizes = graph.get_tensor(out)->sizes();
-  int64_t t_mat1_dim = t_mat1_sizes.size();
-  int64_t out_dim = out_sizes.size();
-
-  check_matmul_args(graph, arg1, arg2, out);
-  auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
-
-  // optimized mm
-  if (graph.memory_layout_of(arg1) == api::kChannelsPacked) {
-    ValueRef t_mat1_width_packed =
-        graph.add_tensor_like(arg1, api::kWidthPacked);
-    viewFn(graph, {arg1, graph.add_none(), t_mat1_width_packed});
-    arg1 = t_mat1_width_packed;
-
-    if (graph.memory_layout_of(arg2) != api::kHeightPacked) {
-      ValueRef t_mat2_height_packed =
-          graph.add_tensor(t_mat2_sizes, api::kFloat, api::kHeightPacked);
-      viewFn(graph, {arg2, graph.add_none(), t_mat2_height_packed});
-      arg2 = t_mat2_height_packed;
-    }
-
-    vTensorPtr t_mat1 = graph.get_tensor(arg1);
-    vTensorPtr t_mat2 = graph.get_tensor(arg2);
-
-    VK_CHECK_COND(check_memory_layout_is(*t_mat1, api::kWidthPacked));
-    VK_CHECK_COND(check_memory_layout_is(*t_mat2, api::kHeightPacked));
-
-    // Step size is the 2d input's width dimension / 4.
-    int32_t step_size =
-        api::utils::div_up(t_mat1_sizes.at(t_mat1_dim - 1), INT64_C(4));
-
-    // reminder is used in shader to detect whether the fetched texel is out of
-    // boundary
-    int32_t reminder = t_mat1_sizes.at(t_mat1_dim - 1) % INT64_C(4);
-
-    int64_t batch_size = 1;
-    if (t_mat1_dim == 3) {
-      batch_size = t_mat1_sizes.at(0);
-    }
-
-    vTensorPtr t_out = graph.get_tensor(out);
-
-    api::utils::uvec3 global_size = {
-        static_cast<unsigned int>(
-            api::utils::div_up(out_sizes.at(t_mat1_dim - 1), INT64_C(4))),
-        static_cast<unsigned int>(
-            api::utils::div_up(out_sizes.at(t_mat1_dim - 2), INT64_C(4))),
-        static_cast<unsigned int>(
-            out_dim == 3 ? api::utils::div_up(out_sizes.at(0), INT64_C(4))
-                         : 1)};
-    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-    std::string kernel_name("matmul_optimized");
-    kernel_name.reserve(kShaderNameReserve);
-
-    add_dtype_suffix(kernel_name, *t_out);
-
-    graph.execute_nodes().emplace_back(new ExecuteNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        // Inputs and Outputs
-        {{out, api::MemoryAccessType::WRITE},
-         {{arg1, arg2}, api::MemoryAccessType::READ}},
-        // Shader params buffers
-        {
-            t_out->texture_limits_ubo(),
-            graph.create_params_buffer(step_size),
-            graph.create_params_buffer(reminder),
-            graph.create_params_buffer(batch_size),
-        },
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_matmul_node));
-  } else if (graph.memory_layout_of(arg1) == api::kWidthPacked) {
-    // native mm
-    if (graph.memory_layout_of(arg2) != api::kHeightPacked) {
-      ValueRef t_mat2_height_packed =
-          graph.add_tensor(t_mat2_sizes, api::kFloat, api::kHeightPacked);
-      viewFn(graph, {arg2, graph.add_none(), t_mat2_height_packed});
-      arg2 = t_mat2_height_packed;
-    }
-
-    vTensorPtr t_mat1 = graph.get_tensor(arg1);
-    vTensorPtr t_mat2 = graph.get_tensor(arg2);
-    vTensorPtr t_out = graph.get_tensor(out);
-
-    VK_CHECK_COND(check_memory_layout_is(*t_mat2, api::kHeightPacked));
-
-    api::utils::uvec3 global_size = t_out->extents();
-    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-    std::string kernel_name("matmul_naive");
-    kernel_name.reserve(kShaderNameReserve);
-    add_memory_layout_suffix(kernel_name, *t_mat1);
-    add_memory_layout_suffix(kernel_name, *t_mat2);
-    add_dtype_suffix(kernel_name, *t_out);
-
-    graph.execute_nodes().emplace_back(new ExecuteNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        // Inputs and Outputs
-        {{out, api::MemoryAccessType::WRITE},
-         {{arg1, arg2}, api::MemoryAccessType::READ}},
-        // Shader params buffers
-        {
-            t_out->texture_limits_ubo(),
-            t_mat1->sizes_ubo(),
-        },
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_matmul_node));
-  } else {
-    VK_THROW("Input should be channel packed or width packed.");
-  }
+  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
+
+  api::utils::uvec3 global_size = graph.extents_of(out);
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  std::string kernel_name("matmul_naive");
+  kernel_name.reserve(kShaderNameReserve);
+  add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1));
+  add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE},
+       {{mat1, mat2}, api::MemoryAccessType::READ}},
+      // Shader params buffers
+      {
+          graph.texture_limits_ubo(out),
+          graph.sizes_ubo(mat1),
+      },
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_matmul_node));
 }
 
-void add_addmm_node(
+void add_matmul_optimized_node(
     ComputeGraph& graph,
-    const ValueRef self,
     const ValueRef mat1,
-    const ValueRef mat2,
-    const ValueRef beta,
-    const ValueRef alpha,
+    const ValueRef mat2_data,
     const ValueRef out) {
-  ValueRef arg1 = prepack_if_tensor_ref(graph, mat1, api::kChannelsPacked);
-  ValueRef arg2 = prepack_if_tensor_ref(graph, mat2, api::kHeightPacked);
-
-  std::vector<int64_t> t_mat1_sizes = graph.get_tensor(arg1)->sizes();
-  std::vector<int64_t> t_mat2_sizes = graph.get_tensor(arg2)->sizes();
-  std::vector<int64_t> out_sizes = graph.get_tensor(out)->sizes();
-  int64_t t_mat1_dim = t_mat1_sizes.size();
-
-  ValueRef self_arg;
-  int broadcast_at_width = 0;
-  int broadcast_at_height = 0;
-  float alpha_val = 1.0f;
-  float beta_val = 1.0f;
-  if (graph.memory_layout_of(arg1) == api::kChannelsPacked) {
-    self_arg = prepack_if_tensor_ref(graph, self, api::kChannelsPacked);
-  } else if (graph.memory_layout_of(arg1) == api::kWidthPacked) {
-    self_arg = prepack_if_tensor_ref(graph, self, api::kWidthPacked);
-  } else {
-    VK_THROW("Input should be channel packed or width packed.");
-  }
-
-  std::vector<int64_t> self_sizes = graph.get_tensor(self_arg)->sizes();
-  int64_t self_dim = self_sizes.size();
-  if (self_sizes.at(self_dim - 1) < out_sizes.at(t_mat1_dim - 1)) {
-    broadcast_at_width = 1;
-  }
-  if (self_dim < t_mat1_dim || self_sizes.at(0) < out_sizes.at(0)) {
-    broadcast_at_height = 1;
-  }
-  alpha_val = graph.extract_scalar<float>(alpha);
-  beta_val = graph.extract_scalar<float>(beta);
+  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
-  AddmmParams addmm_params = {
-      broadcast_at_width, broadcast_at_height, alpha_val, beta_val};
-
-  check_matmul_args(graph, arg1, arg2, out);
+  // Ensure mat1 is width packed
+  ValueRef mat1_W_packed = graph.add_tensor_like(mat1, api::kWidthPacked);
   auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
+  viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
 
-  // optimized mm
-  if (graph.memory_layout_of(arg1) == api::kChannelsPacked) {
-    ValueRef t_mat1_width_packed =
-        graph.add_tensor(t_mat1_sizes, api::kFloat, api::kWidthPacked);
-    viewFn(graph, {arg1, graph.add_none(), t_mat1_width_packed});
-    arg1 = t_mat1_width_packed;
-
-    if (graph.memory_layout_of(arg2) != api::kHeightPacked) {
-      ValueRef t_mat2_height_packed =
-          graph.add_tensor(t_mat2_sizes, api::kFloat, api::kHeightPacked);
-      viewFn(graph, {arg2, graph.add_none(), t_mat2_height_packed});
-      arg2 = t_mat2_height_packed;
-    }
-
-    vTensorPtr t_mat1 = graph.get_tensor(arg1);
-    vTensorPtr t_mat2 = graph.get_tensor(arg2);
-
-    VK_CHECK_COND(check_memory_layout_is(*t_mat1, api::kWidthPacked));
-    VK_CHECK_COND(check_memory_layout_is(*t_mat2, api::kHeightPacked));
-
-    // Step size is the 2d input's width dimension / 4.
-    int32_t step_size =
-        api::utils::div_up(t_mat1_sizes.at(t_mat1_dim - 1), INT64_C(4));
-
-    // reminder is used in shader to detect whether the fetched texel is out of
-    // boundary
-    int32_t reminder = t_mat1_sizes.at(t_mat1_dim - 1) % INT64_C(4);
-
-    int64_t batch_size = 1;
-    if (t_mat1_dim == 3) {
-      batch_size = t_mat1_sizes.at(0);
-    }
-
-    vTensorPtr t_out = graph.get_tensor(out);
-    int64_t out_dim = out_sizes.size();
-
-    api::utils::uvec3 global_size = {
-        static_cast<unsigned int>(
-            api::utils::div_up(out_sizes.at(t_mat1_dim - 1), INT64_C(4))),
-        static_cast<unsigned int>(
-            api::utils::div_up(out_sizes.at(t_mat1_dim - 2), INT64_C(4))),
-        static_cast<unsigned int>(
-            out_dim == 3 ? api::utils::div_up(out_sizes.at(0), INT64_C(4))
-                         : 1)};
-    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-    std::string kernel_name("addmm_optimized");
-    kernel_name.reserve(kShaderNameReserve);
-
-    add_dtype_suffix(kernel_name, *t_out);
-
-    graph.execute_nodes().emplace_back(new ExecuteNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        // Inputs and Outputs
-        {{out, api::MemoryAccessType::WRITE},
-         {{arg1, arg2, self_arg}, api::MemoryAccessType::READ}},
-        // Shader params buffers
-        {
-            t_out->texture_limits_ubo(),
-            graph.create_params_buffer(step_size),
-            graph.create_params_buffer(reminder),
-            graph.create_params_buffer(batch_size),
-            graph.create_params_buffer(addmm_params),
-        },
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_matmul_node));
-  } else if (graph.memory_layout_of(arg1) == api::kWidthPacked) {
-    // native mm
-    if (graph.memory_layout_of(arg2) != api::kHeightPacked) {
-      ValueRef t_mat2_height_packed =
-          graph.add_tensor(t_mat2_sizes, api::kFloat, api::kHeightPacked);
-      viewFn(graph, {arg2, graph.add_none(), t_mat2_height_packed});
-      arg2 = t_mat2_height_packed;
-    }
-
-    vTensorPtr t_mat1 = graph.get_tensor(arg1);
-    vTensorPtr t_mat2 = graph.get_tensor(arg2);
-    vTensorPtr t_out = graph.get_tensor(out);
-
-    VK_CHECK_COND(check_memory_layout_is(*t_mat2, api::kHeightPacked));
-
-    api::utils::uvec3 global_size = t_out->extents();
-    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+  // Ensure mat2 is height packed
+  ValueRef mat2_H_packed = mat2;
+  if (graph.memory_layout_of(mat2) != api::kHeightPacked) {
+    mat2_H_packed = graph.add_tensor_like(mat2, api::kHeightPacked);
+    viewFn(graph, {mat2, graph.add_none(), mat2_H_packed});
+  }
 
-    std::string kernel_name("addmm_naive");
-    kernel_name.reserve(kShaderNameReserve);
-    add_memory_layout_suffix(kernel_name, *t_mat1);
-    add_memory_layout_suffix(kernel_name, *t_mat2);
-    add_dtype_suffix(kernel_name, *t_out);
+  api::utils::uvec3 global_size =
+      api::utils::divup_vec(graph.extents_of(out), {4, 4, 1});
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  std::string kernel_name("matmul_optimized");
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE},
+       {{mat1_W_packed, mat2_H_packed}, api::MemoryAccessType::READ}},
+      // Shader params buffers
+      {
+          graph.texture_limits_ubo(out),
+          graph.sizes_ubo(out),
+          graph.packed_dim_meta_ubo(mat1_W_packed),
+      },
+      // Specialization Constants
+      {}));
+}
 
-    graph.execute_nodes().emplace_back(new ExecuteNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        // Inputs and Outputs
-        {{out, api::MemoryAccessType::WRITE},
-         {{arg1, arg2, self_arg}, api::MemoryAccessType::READ}},
-        // Shader params buffers
-        {
-            t_out->texture_limits_ubo(),
-            t_mat1->sizes_ubo(),
-            graph.create_params_buffer(addmm_params),
-        },
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_matmul_node));
+void add_matmul_node(
+    ComputeGraph& graph,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef out) {
+  if (graph.memory_layout_of(mat1) == api::kChannelsPacked) {
+    add_matmul_optimized_node(graph, mat1, mat2_data, out);
+  } else if (graph.memory_layout_of(mat1) == api::kWidthPacked) {
+    add_matmul_naive_node(graph, mat1, mat2_data, out);
   } else {
     VK_THROW("Input should be channel packed or width packed.");
   }
 }
 
-void addmm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_addmm_node(
-      graph, args[0], args[1], args[2], args[3], args[4], args[5]);
-}
-
 void matmul(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  check_matmul_args(graph, args[0], args[1], args[2]);
   return add_matmul_node(graph, args[0], args[1], args[2]);
 }
 
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.mm.default, matmul);
   VK_REGISTER_OP(aten.bmm.default, matmul);
-  VK_REGISTER_OP(aten.addmm.default, addmm);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
index 0bca0b4f055..41e8f0fb02f 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp
@@ -10,15 +10,15 @@
 
 namespace vkcompute {
 
-void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor) {
-  switch (tensor.image().format()) {
-    case VK_FORMAT_R32G32B32A32_SFLOAT:
+void add_dtype_suffix(std::string& kernel_name, const api::ScalarType dtype) {
+  switch (dtype) {
+    case api::kFloat:
       kernel_name += "_float";
       break;
-    case VK_FORMAT_R16G16B16A16_SFLOAT:
+    case api::kHalf:
       kernel_name += "_half";
       break;
-    case VK_FORMAT_R32G32B32A32_SINT:
+    case api::kInt:
       kernel_name += "_int";
       break;
     default:
@@ -26,6 +26,10 @@ void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor) {
   }
 }
 
+void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor) {
+  return add_dtype_suffix(kernel_name, tensor.dtype());
+}
+
 void add_ndim_suffix(std::string& kernel_name, const vTensor& tensor) {
   switch (tensor.storage_type()) {
     case api::kTexture3D:
@@ -39,8 +43,10 @@ void add_ndim_suffix(std::string& kernel_name, const vTensor& tensor) {
   }
 }
 
-void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor) {
-  switch (tensor.gpu_memory_layout()) {
+void add_memory_layout_suffix(
+    std::string& kernel_name,
+    api::GPUMemoryLayout layout) {
+  switch (layout) {
     case api::kChannelsPacked:
       kernel_name += "_C_packed";
       break;
@@ -55,4 +61,8 @@ void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor) {
   }
 }
 
+void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor) {
+  return add_memory_layout_suffix(kernel_name, tensor.gpu_memory_layout());
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
index a784a4acb4c..bf97efcd4ec 100644
--- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
+++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h
@@ -16,10 +16,15 @@ namespace vkcompute {
 
 constexpr size_t kShaderNameReserve = 64u;
 
+void add_dtype_suffix(std::string& kernel_name, const api::ScalarType dtype);
 void add_dtype_suffix(std::string& kernel_name, const vTensor& tensor);
 
+void add_ndim_suffix(std::string& kernel_name, const size_t ndim);
 void add_ndim_suffix(std::string& kernel_name, const vTensor& tensor);
 
+void add_memory_layout_suffix(
+    std::string& kernel_name,
+    const api::GPUMemoryLayout layout);
 void add_memory_layout_suffix(std::string& kernel_name, const vTensor& tensor);
 
 } // namespace vkcompute

From e288039789a51a5f0760e44d4a0e05a3dd30b81b Mon Sep 17 00:00:00 2001
From: Lucy Qiu <lfq@meta.com>
Date: Mon, 13 May 2024 11:21:00 -0700
Subject: [PATCH 39/62] Comment out memory planning log (#3581)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/3581

Reviewed By: mergennachin

Differential Revision: D57187017

fbshipit-source-id: 1a73bccbd322f0c0b3d87b627ec5f2221b6c3396
---
 runtime/executor/method.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index 1184eb0d3c8..3ac8e4897e4 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -933,10 +933,15 @@ Method::set_output_data_ptr(void* buffer, size_t size, size_t output_idx) {
       InvalidState,
       "Outputs can not be retrieved until method has been initialized.");
 
-  ET_CHECK_OR_RETURN_ERROR(
-      !pre_allocated_output_,
-      InvalidState,
-      "Overriding output data pointer allocated by memory plan is not allowed.");
+  // ET_CHECK_OR_RETURN_ERROR(
+  //     !pre_allocated_output_,
+  //     InvalidState,
+  //     "Overriding output data pointer allocated by memory plan is not
+  //     allowed.");
+  // TODO(T188740925): for now, return error without logs.
+  if (pre_allocated_output_) {
+    return ::torch::executor::Error::InvalidState;
+  }
 
   // Check the args
   ET_CHECK_OR_RETURN_ERROR(

From 87d828aabe88fd40e33fc82e03a063aec6f0b4fc Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Mon, 13 May 2024 11:32:15 -0700
Subject: [PATCH 40/62] don't partition max pool with ceil mode (#3578)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3578

XNNPACK doesn't support max pooling with ceil mode, so we should not be partitioning these nodes where ceil mode is True

Resolving this issue:
https://github.com/pytorch/executorch/issues/3567

Reviewed By: mergennachin, digantdesai

Differential Revision: D57228128

fbshipit-source-id: ee57a783d314d69ebef57f0e1707c0d038582a31
---
 .../xnnpack/partition/xnnpack_partitioner.py  |  5 +++
 backends/xnnpack/test/ops/maxpool2d.py        | 36 +++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py
index f5b11a631a3..6d483e4ea00 100644
--- a/backends/xnnpack/partition/xnnpack_partitioner.py
+++ b/backends/xnnpack/partition/xnnpack_partitioner.py
@@ -166,6 +166,8 @@ def _check_outputs_are_valid_dtypes(self, node, valid_dtypes):
         return True
 
     def check_node_has_valid_dtype(self, node):
+        # max_pool2d_with_indices returns indices, which are int64;
+        # this is supportable within XNNPACK
         if node.target in {exir_ops.edge.aten.max_pool2d_with_indices.default}:
             return True
 
@@ -268,13 +270,16 @@ def maxpool2d_with_indices(
     ) -> bool:
         """
         Only if the first output value is consumed in the graph
+        and it is not in ceil mode
         """
         users = list(node.users.keys())
+        is_ceil_mode = len(node.args) >= 6 and node.args[5]
         return (
             True
             if len(users) == 1
             and users[0].target == operator.getitem
             and users[0].args[1] == 0
+            and not is_ceil_mode
             else False
         )
 
diff --git a/backends/xnnpack/test/ops/maxpool2d.py b/backends/xnnpack/test/ops/maxpool2d.py
index e919fc6e776..889c29a5f38 100644
--- a/backends/xnnpack/test/ops/maxpool2d.py
+++ b/backends/xnnpack/test/ops/maxpool2d.py
@@ -38,6 +38,14 @@ def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1):
         def forward(self, x):
             return self.max_pool2d_module(x)[1]
 
+    class MaxPool2dUnsupportedCeilMode(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.max_pool2d_module = torch.nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        def forward(self, x):
+            return self.max_pool2d_module(x)
+
     def _test_maxpool2d(self, inputs):
         """
         Note that the export process generates aten.max_pool2d_with_indices. The remove_getitem_op
@@ -99,6 +107,34 @@ def test_fp32_maxpool2d_unsupported(self):
             )
         )
 
+    def test_fp32_maxpool2d_unsupported_ceilmode(self):
+        """
+        MaxPool2d with ceil mode is not generally supported (see maxpool2d constraint).
+        """
+        inputs = (torch.randn(1, 32, 23, 23),)
+        (
+            Tester(self.MaxPool2dUnsupportedCeilMode(), inputs)
+            .export()
+            .check_count({"torch.ops.aten.max_pool2d_with_indices.default": 1})
+            .to_edge()
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1
+                }
+            )
+            .partition()
+            # We expect it not to be delegated.
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 0})
+            .check_count(
+                {
+                    "executorch_exir_dialects_edge__ops_aten_max_pool2d_with_indices_default": 1
+                }
+            )
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
+
     def test_qs8_maxpool2d(self):
         class MaxPool(torch.nn.Module):
             def __init__(self, maxpool_params):

From 4b5e434c4a58fe49fbf9659ae5078182c0e84217 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Mon, 13 May 2024 14:23:38 -0700
Subject: [PATCH 41/62] Reuse the existing clone of the ios toolchain for Core
 ML. (#3575)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3575

.

Reviewed By: mergennachin

Differential Revision: D57225246

fbshipit-source-id: c0145c4dcabd8aff8fbb38d51ff6e4ddcc87f48c
---
 backends/apple/coreml/scripts/build_tests.sh         |  2 +-
 .../apple/coreml/scripts/install_requirements.sh     | 12 ++----------
 .../apple/coreml/scripts/build_executor_runner.sh    |  2 +-
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh
index 730ba0839db..911c6cd4e10 100755
--- a/backends/apple/coreml/scripts/build_tests.sh
+++ b/backends/apple/coreml/scripts/build_tests.sh
@@ -13,7 +13,7 @@ SCRIPT_DIR_PATH="$(
 EXECUTORCH_ROOT_PATH=$(realpath "$SCRIPT_DIR_PATH/../../../../")
 COREML_DIR_PATH="$EXECUTORCH_ROOT_PATH/backends/apple/coreml"
 PROTOBUF_DIR_PATH="$COREML_DIR_PATH/third-party/coremltools/deps/protobuf"
-IOS_TOOLCHAIN_PATH="$COREML_DIR_PATH/third-party/ios-cmake/ios.toolchain.cmake"
+IOS_TOOLCHAIN_PATH="$EXECUTORCH_ROOT_PATH/third-party/ios-cmake/ios.toolchain.cmake"
 CMAKE_EXECUTORCH_BUILD_DIR_PATH="$COREML_DIR_PATH/executorch-cmake-out"
 CMAKE_PROTOBUF_BUILD_DIR_PATH="$COREML_DIR_PATH/protobuf-cmake-out"
 LIBRARIES_DIR_PATH="$COREML_DIR_PATH/runtime/libraries"
diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh
index b48ac7bfb69..baf731452e9 100755
--- a/backends/apple/coreml/scripts/install_requirements.sh
+++ b/backends/apple/coreml/scripts/install_requirements.sh
@@ -53,14 +53,6 @@ if [ $STATUS -ne 0 ]; then
     exit 1
 fi
 
-echo "${green}ExecuTorch: Cloning ios-cmake."
-git clone https://github.com/leetal/ios-cmake.git "$COREML_DIR_PATH/third-party/ios-cmake"
-STATUS=$?
-if [ $STATUS -ne 0 ]; then
-    echo "${red}ExecuTorch: Failed to clone ios-cmake."
-    exit 1
-fi
-
 echo "${green}ExecuTorch: Cloning nlohmann."
 git clone https://github.com/nlohmann/json.git "$COREML_DIR_PATH/third-party/nlohmann_json"
 STATUS=$?
@@ -72,5 +64,5 @@ fi
 sh "$COREML_DIR_PATH/scripts/install_inmemoryfs.sh"
 
 echo "${green}ExecuTorch: Copying protobuf files."
-mkdir -p "$COREML_DIR_PATH/runtime/sdk/format/" 
-cp -rf "$PROTOBUF_FILES_DIR_PATH" "$COREML_DIR_PATH/runtime/sdk/format/" 
+mkdir -p "$COREML_DIR_PATH/runtime/sdk/format/"
+cp -rf "$PROTOBUF_FILES_DIR_PATH" "$COREML_DIR_PATH/runtime/sdk/format/"
diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh
index 347f3b4474f..86ff5f6edb9 100755
--- a/examples/apple/coreml/scripts/build_executor_runner.sh
+++ b/examples/apple/coreml/scripts/build_executor_runner.sh
@@ -13,7 +13,7 @@ SCRIPT_DIR_PATH="$(
 EXECUTORCH_ROOT_PATH=$(realpath "$SCRIPT_DIR_PATH/../../../../")
 COREML_DIR_PATH="$EXECUTORCH_ROOT_PATH/backends/apple/coreml"
 EXAMPLES_COREML_DIR_PATH="$EXECUTORCH_ROOT_PATH/examples/apple/coreml"
-IOS_TOOLCHAIN_PATH="$COREML_DIR_PATH/third-party/ios-cmake/ios.toolchain.cmake"
+IOS_TOOLCHAIN_PATH="$EXECUTORCH_ROOT_PATH/third-party/ios-cmake/ios.toolchain.cmake"
 CMAKE_BUILD_DIR_PATH="$EXAMPLES_COREML_DIR_PATH/cmake-out"
 LIBRARIES_DIR_PATH="$EXAMPLES_COREML_DIR_PATH/executor_runner/libraries"
 INCLUDE_DIR_PATH="$EXAMPLES_COREML_DIR_PATH/executor_runner/include"

From ea9647f470cf2cd5bda2b034cbf9ae9896f37039 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <dvieriu@apple.com>
Date: Mon, 13 May 2024 14:23:59 -0700
Subject: [PATCH 42/62] Add support for slice_scatter; enable index_put (#3399)

Summary:
Summary of changes:
- support for scatter slice
- enable index put

With whole-model delegation, I am seeing the following crash in llama2:
```
in _verify_exported_program_signature
    raise SpecViolationError(
torch._export.verifier.SpecViolationError: Buffer output getitem_1 does not point to a buffer that exists.
Dict of buffers that are mutated, in order: {'getitem_1': 'layers_0_attention_SDPA_kv_cache_k_cache', 'getitem': 'layers_0_attention_SDPA_kv_cache_v_cache', 'getitem_3': 'layers_1_attention_SDPA_kv_cache_k_cache', 'getitem_2': 'layers_1_attention_SDPA_kv_cache_v_cache', 'getitem_5': 'layers_2_attention_SDPA_kv_cache_k_cache', 'getitem_4': 'layers_2_attention_SDPA_kv_cache_v_cache', 'getitem_7': 'layers_3_attention_SDPA_kv_cache_k_cache', 'getitem_6': 'layers_3_attention_SDPA_kv_cache_v_cache', 'getitem_9': 'layers_4_attention_SDPA_kv_cache_k_cache', 'getitem_8': 'layers_4_attention_SDPA_kv_cache_v_cache'}
Buffer nodes available: []
```

Commands to lower llama2 to MPS:
- python -m examples.models.llama2.export_llama  -kv  --mps
- python3 -m examples.apple.mps.scripts.mps_example --model_name="llama2"

Pull Request resolved: https://github.com/pytorch/executorch/pull/3399

Reviewed By: shoumikhin

Differential Revision: D57293487

Pulled By: cccclai

fbshipit-source-id: a7ea392dc3c14b3538416b492d512aec71a0524e
---
 backends/apple/mps/operators/indexing_ops.py  | 88 ++++++++++++++++++-
 backends/apple/mps/operators/node_visitor.py  | 33 +++++++
 backends/apple/mps/runtime/MPSGraphBuilder.h  |  1 +
 .../mps/runtime/operations/IndexingOps.mm     | 24 +++++
 .../mps/runtime/operations/OperationUtils.mm  |  1 +
 .../mps/serialization/mps_graph_schema.py     |  8 ++
 backends/apple/mps/serialization/schema.fbs   |  9 ++
 .../apple/mps/test/test_mps_indexing_ops.py   | 41 ++++++++-
 8 files changed, 200 insertions(+), 5 deletions(-)

diff --git a/backends/apple/mps/operators/indexing_ops.py b/backends/apple/mps/operators/indexing_ops.py
index 690549973a4..02506e11823 100644
--- a/backends/apple/mps/operators/indexing_ops.py
+++ b/backends/apple/mps/operators/indexing_ops.py
@@ -16,6 +16,7 @@
     MPSIndexPut,
     MPSIndexSelect,
     MPSIndexTensor,
+    MPSScatter,
 )
 from executorch.backends.apple.mps.utils.mps_utils import get_input_node
 from executorch.backends.transforms import get_shape
@@ -65,12 +66,9 @@ def define_node(
         mps_graph.mps_nodes.append(mps_node)
 
 
-# [MPS TODO]: Works on a single iteration of llama2, but subsequent tokens
-# are wrong when using Index put. Disabling it for now.
 @register_node_visitor
 class IndexPutVisitor(NodeVisitor):
-    # target = "aten.index_put.default"
-    target = "disabled"
+    target = "aten.index_put.default"
 
     def __init__(self, *args) -> None:
         super().__init__(*args)
@@ -115,6 +113,88 @@ def define_node(
         mps_graph.mps_nodes.append(mps_node)
 
 
+@register_node_visitor
+class SliceScatterVisitor(NodeVisitor):
+    target = "aten.slice_scatter.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+        self.invalid_val = 2**63 - 1
+
+    def maybe_wrap_dim(self, dim: int, n: int) -> List[int]:
+        if dim < 0:
+            wrapped_dim = dim + n
+            if wrapped_dim < 0:
+                wrapped_dim = 0
+            return wrapped_dim
+        elif dim > n:
+            return n
+        return dim
+
+    def get_exapnded_index(self, idx, shape, dim):
+        if idx.dim() == 0:
+            return idx.expand(shape)
+
+        dim = self.maybe_wrap_dim(dim, len(shape))
+
+        # setup new_index_shape as [BS, 1, ..., idx_size, ..., 1]
+        # to reshape index_
+        idx_size = idx.size(0)
+        new_index_shape = [1] * len(shape)
+        new_index_shape[dim] = idx_size
+
+        # Now apply expand to index_
+        index = idx.view(new_index_shape)
+        new_index_shape = list(shape)
+        new_index_shape[dim] = idx_size
+        index = index.expand(new_index_shape)
+
+        return index
+
+    def get_slice_scatter_indices(
+        self, dim, start, end, step, input_shape, dtype=torch.int64
+    ):
+        idx = torch.arange(start, end, step, dtype=dtype)
+        return self.get_exapnded_index(idx, input_shape, dim)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        mps_node = self.create_unary_node(node, mps_graph, MPSScatter)
+
+        start = None
+        end = None
+        step = 1
+
+        mps_node.mpsnode_union.src_id = self.define_tensor(
+            get_input_node(node, 1), mps_graph
+        )
+        if len(node.args) >= 3:
+            mps_node.mpsnode_union.dim = cast(int, node.args[2])
+        if len(node.args) >= 4:
+            start = cast(int, node.args[3])
+        if len(node.args) >= 5 and node.args[4] != self.invalid_val:
+            end = cast(int, node.args[4])
+        if len(node.args) >= 6:
+            step = cast(int, node.args[5])
+
+        input_shape = get_shape(get_input_node(node, 0))
+        dim_len = input_shape[
+            self.maybe_wrap_dim(mps_node.mpsnode_union.dim, len(input_shape))
+        ]
+
+        start_val = start if start is not None else 0
+        end_val = end if end is not None else dim_len
+
+        scatter_indices = self.get_slice_scatter_indices(
+            mps_node.mpsnode_union.dim, start_val, end_val, step, input_shape
+        )
+        mps_node.mpsnode_union.idx_id = self.define_constant(scatter_indices, mps_graph)
+        mps_graph.mps_nodes.append(mps_node)
+
+
 @register_node_visitor
 class EmbeddingVisitor(NodeVisitor):
     target = "aten.embedding.default"
diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py
index e9f879db88a..0b9b2d5512c 100644
--- a/backends/apple/mps/operators/node_visitor.py
+++ b/backends/apple/mps/operators/node_visitor.py
@@ -143,6 +143,38 @@ def define_tensor_list(self, node: torch.fx.Node, mps_graph: MPSGraph) -> List[i
             mps_graph.mps_values.append(mps_tensor)
         return self.tensor_to_id[node]
 
+    def define_constant(
+        self,
+        constant_tensor: torch.tensor,
+        mps_graph: MPSGraph,
+    ):
+        """Defines a constant tensor into the MPSGraph serialization schema
+
+        Args:
+            constant_tensor (torch.Tensor): constant tensor to define into mps_graph
+            mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer
+        """
+        constant_tensor = constant_tensor.contiguous()
+        # MPS TODO: cache these values
+        id = len(mps_graph.mps_values)
+        self.tensor_to_id[constant_tensor] = id
+        mps_data_type = edge_dtype_to_mps_dtype(constant_tensor.dtype)
+        constant_buffer_size, constant_buffer, mps_data_type = self.get_serialized_data(
+            constant_tensor, mps_graph, mps_data_type, id
+        )
+        dims = list(constant_tensor.shape)
+
+        mps_tensor = MPSTensor(
+            datatype=mps_data_type,
+            num_dims=len(dims),
+            dims=dims,
+            constant_buffer_size=constant_buffer_size,
+            constant_buffer=constant_buffer,
+        )
+
+        mps_graph.mps_values.append(mps_tensor)
+        return id
+
     def define_scalar(
         self,
         val: Union[float, int],
@@ -157,6 +189,7 @@ def define_scalar(
         """
         assert isinstance(val, int) or isinstance(val, float)
 
+        # MPS TODO: cache these values
         id = len(mps_graph.mps_values)
         self.tensor_to_id[val] = id
 
diff --git a/backends/apple/mps/runtime/MPSGraphBuilder.h b/backends/apple/mps/runtime/MPSGraphBuilder.h
index e4e89d68691..29b9471ae9a 100644
--- a/backends/apple/mps/runtime/MPSGraphBuilder.h
+++ b/backends/apple/mps/runtime/MPSGraphBuilder.h
@@ -123,6 +123,7 @@ class MPSGraphBuilder {
   _DEFINE_MPS_OP(Embedding);
   _DEFINE_MPS_OP(IndexTensor);
   _DEFINE_MPS_OP(IndexPut);
+  _DEFINE_MPS_OP(Scatter);
   // Linear algebra ops
   _DEFINE_MPS_OP(MatMul);
   _DEFINE_MPS_OP(Addmm);
diff --git a/backends/apple/mps/runtime/operations/IndexingOps.mm b/backends/apple/mps/runtime/operations/IndexingOps.mm
index b4dcf192b46..6536aa52cf3 100644
--- a/backends/apple/mps/runtime/operations/IndexingOps.mm
+++ b/backends/apple/mps/runtime/operations/IndexingOps.mm
@@ -204,6 +204,30 @@
   return err;
 }
 
+Error
+MPSGraphBuilder::mpsScatterOp(NodePtr nodePtr) {
+  auto graphNode = nodePtr->mpsnode_union_as_MPSScatter();
+  ET_LOG(
+    Debug, "%s %d: %d",
+    __FUNCTION__, graphNode->input1_id(), graphNode->output_id()
+  );
+
+  int64_t dim = graphNode->dim();
+  MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id());
+  MPSGraphTensor* indicesTensor = getMPSGraphTensor(graphNode->idx_id());
+  MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->src_id());
+
+  _idToMPSGraphTensor[graphNode->output_id()] =
+    [_mpsGraph scatterAlongAxis:dim
+                 withDataTensor:inputTensor
+                  updatesTensor:updatesTensor
+                  indicesTensor:indicesTensor
+                           mode:MPSGraphScatterModeSet
+                           name:nil];
+  return Error::Ok;
+}
+
+
 } // namespace delegate
 } // namespace mps
 } // namespace executor
diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm
index 648421ee2cd..21c4a0d3e7b 100644
--- a/backends/apple/mps/runtime/operations/OperationUtils.mm
+++ b/backends/apple/mps/runtime/operations/OperationUtils.mm
@@ -181,6 +181,7 @@
     _DEFINE_MPS_NODE(Embedding);
     _DEFINE_MPS_NODE(IndexTensor);
     _DEFINE_MPS_NODE(IndexPut);
+    _DEFINE_MPS_NODE(Scatter);
     // Reduce ops
     _DEFINE_MPS_NODE(Mean);
     // Shape ops
diff --git a/backends/apple/mps/serialization/mps_graph_schema.py b/backends/apple/mps/serialization/mps_graph_schema.py
index 8134091a01d..6909926e8cf 100644
--- a/backends/apple/mps/serialization/mps_graph_schema.py
+++ b/backends/apple/mps/serialization/mps_graph_schema.py
@@ -456,6 +456,13 @@ class MPSIndexPut(MPSNode1x1):
     values_id: int = -1
 
 
+@dataclass
+class MPSScatter(MPSNode1x1):
+    dim: int = 0
+    idx_id: int = -1
+    src_id: int = -1
+
+
 ##
 ## Shape ops
 ##
@@ -703,6 +710,7 @@ class MPSArange:
     MPSEmbedding,
     MPSIndexTensor,
     MPSIndexPut,
+    MPSScatter,
     # Shape ops
     MPSPermute,
     MPSView,
diff --git a/backends/apple/mps/serialization/schema.fbs b/backends/apple/mps/serialization/schema.fbs
index 6ba2c937f32..6e089d4526f 100644
--- a/backends/apple/mps/serialization/schema.fbs
+++ b/backends/apple/mps/serialization/schema.fbs
@@ -166,6 +166,14 @@ table MPSIndexPut {
   output_id:int;
 }
 
+table MPSScatter {
+  input1_id:int;
+  output_id:int;
+  dim:long;
+  idx_id:int;
+  src_id:int;
+}
+
 // Shape ops.
 table MPSPermute {
   input1_id:int;
@@ -390,6 +398,7 @@ union MPSNodeUnion {
     MPSEmbedding,
     MPSIndexTensor,
     MPSIndexPut,
+    MPSScatter,
 
     // Reduce ops
     MPSMean,
diff --git a/backends/apple/mps/test/test_mps_indexing_ops.py b/backends/apple/mps/test/test_mps_indexing_ops.py
index 7991f1a165a..03709fc891a 100644
--- a/backends/apple/mps/test/test_mps_indexing_ops.py
+++ b/backends/apple/mps/test/test_mps_indexing_ops.py
@@ -201,7 +201,6 @@ def forward(self, x):
     #     )
 
     def test_mps_indexing_put_1(self):
-
         class IndexPut(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -223,3 +222,43 @@ def forward(self, x, y, z):
         self.lower_and_test_with_partitioner(
             module, model_inputs, func_name=inspect.stack()[0].function[5:]
         )
+
+    def test_mps_indexing_slice_scatter_1(self):
+        class IndexSliceScatter(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                return x.slice_scatter(y, start=6)
+
+        module = IndexSliceScatter()
+        input = torch.zeros(8, 8)
+        src = torch.ones(2, 8)
+        model_inputs = (
+            input,
+            src,
+        )
+
+        self.lower_and_test_with_partitioner(
+            module, model_inputs, func_name=inspect.stack()[0].function[5:]
+        )
+
+    def test_mps_indexing_slice_scatter_2(self):
+        class IndexSliceScatter(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                return x.slice_scatter(y, dim=1, start=2, end=6, step=2)
+
+        module = IndexSliceScatter()
+        input = torch.zeros(8, 8)
+        src = torch.ones(8, 2)
+        model_inputs = (
+            input,
+            src,
+        )
+
+        self.lower_and_test_with_partitioner(
+            module, model_inputs, func_name=inspect.stack()[0].function[5:]
+        )

From ebe701edad93721e22112a434906a76df0d4a764 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Mon, 13 May 2024 14:57:20 -0700
Subject: [PATCH 43/62] Install mpmath in pip installation (#3593)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3593

A followup to https://github.com/pytorch/executorch/issues/2209 and https://github.com/pytorch/executorch/issues/2228

Consistent with https://github.com/pytorch/executorch/blob/main/.ci/docker/requirements-ci.txt

Reviewed By: lucylq

Differential Revision: D57289472

fbshipit-source-id: e155f627fe537be85129c70da5fa91ae555b010a
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b6926a2f5f6..ca5358e25fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies=[
   "expecttest",
   "flatbuffers",
   "hypothesis",
+  "mpmath==1.3.0",
   "numpy>=1.25.2",
   "packaging",
   "pandas",

From c853b3cc4440a2a0f77e5de478a399ed955ef518 Mon Sep 17 00:00:00 2001
From: David Lin <lind@meta.com>
Date: Mon, 13 May 2024 15:19:17 -0700
Subject: [PATCH 44/62] Add base for sgd optimizer (#3496)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3496

This adds the sgd_optimizer header to executorch. Would appreciate some thoughts on where to place this file.

Reviewed By: JacobSzwejbka

Differential Revision: D56888378

fbshipit-source-id: 17d6bb3975ae2d58aee911ee91a3ff07acbc6850
---
 extension/training/optimizer/TARGETS          |  8 +++
 extension/training/optimizer/sgd.h            | 49 +++++++++++++++++++
 extension/training/optimizer/targets.bzl      | 20 ++++++++
 extension/training/optimizer/test/TARGETS     |  8 +++
 .../training/optimizer/test/sgd_test.cpp      | 28 +++++++++++
 extension/training/optimizer/test/targets.bzl | 18 +++++++
 6 files changed, 131 insertions(+)
 create mode 100644 extension/training/optimizer/TARGETS
 create mode 100644 extension/training/optimizer/sgd.h
 create mode 100644 extension/training/optimizer/targets.bzl
 create mode 100644 extension/training/optimizer/test/TARGETS
 create mode 100644 extension/training/optimizer/test/sgd_test.cpp
 create mode 100644 extension/training/optimizer/test/targets.bzl

diff --git a/extension/training/optimizer/TARGETS b/extension/training/optimizer/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/extension/training/optimizer/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/training/optimizer/sgd.h b/extension/training/optimizer/sgd.h
new file mode 100644
index 00000000000..a5f46b44066
--- /dev/null
+++ b/extension/training/optimizer/sgd.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * SGD (stochastic gradient descent) optimizer to perform on-device training.
+ * This uses the gradients calculated in the backwards pass of the loss function
+ * and updates the parameters such that it minimizes the loss.
+ *
+ * This is similar to the Lite Interpreter implementation of the SGD optimizer,
+ * but without the dependency on ATen Tensors and autograd.
+ */
+#pragma once
+
+namespace torch {
+namespace executor {
+namespace optimizer {
+
+/**
+ * SGD optimizer state. This keeps track of the state of a given parameter to
+ * be used in later epochs.
+ */
+class SGDParamState {};
+
+/**
+ * SGD optimizer options. This contains options for performing training on a
+ * param group, such as the learning rate.
+ */
+class SGDOptions {};
+
+/**
+ * SGD optimizer param group. This contains the parameters and
+ * the SGDOptions associated with it.
+ */
+class SGDParamGroup {};
+
+/**
+ * SGD optimizer class. This is responsible for performing the optimization
+ * step.
+ */
+class SGD {};
+
+} // namespace optimizer
+} // namespace executor
+} // namespace torch
diff --git a/extension/training/optimizer/targets.bzl b/extension/training/optimizer/targets.bzl
new file mode 100644
index 00000000000..ffe8e30d7b6
--- /dev/null
+++ b/extension/training/optimizer/targets.bzl
@@ -0,0 +1,20 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "optimizer",
+        exported_headers = [
+            "sgd.h",
+        ],
+        exported_deps = [
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
diff --git a/extension/training/optimizer/test/TARGETS b/extension/training/optimizer/test/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/extension/training/optimizer/test/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/extension/training/optimizer/test/sgd_test.cpp b/extension/training/optimizer/test/sgd_test.cpp
new file mode 100644
index 00000000000..1d35e43458f
--- /dev/null
+++ b/extension/training/optimizer/test/sgd_test.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/training/optimizer/sgd.h>
+
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using namespace torch::executor::optimizer;
+
+class SGDOptimizerTest : public ::testing::Test {};
+
+TEST_F(SGDOptimizerTest, InstantiateTypes) {
+  SGDParamState state;
+  SGDOptions options;
+  SGDParamGroup param_group;
+  SGD sgd;
+
+  EXPECT_TRUE(dynamic_cast<SGDParamState*>(&state) != nullptr);
+  EXPECT_TRUE(dynamic_cast<SGDOptions*>(&options) != nullptr);
+  EXPECT_TRUE(dynamic_cast<SGDParamGroup*>(&param_group) != nullptr);
+  EXPECT_TRUE(dynamic_cast<SGD*>(&sgd) != nullptr);
+}
diff --git a/extension/training/optimizer/test/targets.bzl b/extension/training/optimizer/test/targets.bzl
new file mode 100644
index 00000000000..9d380f90a14
--- /dev/null
+++ b/extension/training/optimizer/test/targets.bzl
@@ -0,0 +1,18 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_test(
+        name = "sgd_test",
+        srcs = [
+            "sgd_test.cpp",
+        ],
+        deps = [
+            "//executorch/extension/training/optimizer:optimizer",
+        ],
+    )

From c69861ddcee5705d37e7d47e2cccdeac741c71e8 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Mon, 13 May 2024 15:23:24 -0700
Subject: [PATCH 45/62] Ease Core ML partitioner and quantizer imports. (#3564)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3564

.

Reviewed By: mcr229

Differential Revision: D57172491

fbshipit-source-id: c7724130d973ca8e7df510e9d5eb95c329c4c2bd
---
 backends/apple/coreml/README.md                       | 4 ++--
 backends/apple/coreml/partition/__init__.py           | 9 +++++++++
 backends/apple/coreml/quantizer/__init__.py           | 9 +++++++++
 backends/apple/coreml/test/test_coreml_partitioner.py | 4 +---
 backends/apple/coreml/test/test_coreml_quantizer.py   | 2 +-
 examples/apple/coreml/scripts/export.py               | 4 +---
 examples/models/llama2/lib/partitioner_lib.py         | 8 +++-----
 7 files changed, 26 insertions(+), 14 deletions(-)
 create mode 100644 backends/apple/coreml/partition/__init__.py
 create mode 100644 backends/apple/coreml/quantizer/__init__.py

diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md
index 4a21d8d8ae1..05b56e9c788 100644
--- a/backends/apple/coreml/README.md
+++ b/backends/apple/coreml/README.md
@@ -28,7 +28,7 @@ import torch
 import executorch.exir
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
-from executorch.backends.apple.coreml.partition.coreml_partitioner import CoreMLPartitioner
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner
 
 class Model(torch.nn.Module):
     def __init__(self):
@@ -72,7 +72,7 @@ from torch.ao.quantization.quantize_pt2e import (
     prepare_qat_pt2e,
 )
 
-from executorch.backends.apple.coreml.quantizer.coreml_quantizer import CoreMLQuantizer
+from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
 from coremltools.optimize.torch.quantization.quantization_config import (
     LinearQuantizerConfig,
     QuantizationScheme,
diff --git a/backends/apple/coreml/partition/__init__.py b/backends/apple/coreml/partition/__init__.py
new file mode 100644
index 00000000000..1630e9ece45
--- /dev/null
+++ b/backends/apple/coreml/partition/__init__.py
@@ -0,0 +1,9 @@
+# Copyright © 2023 Apple Inc. All rights reserved.
+#
+# Please refer to the license found in the LICENSE file in the root directory of the source tree.
+
+from .coreml_partitioner import CoreMLPartitioner
+
+__all__ = [  # Names exported by `import *`; entries must be strings.
+    "CoreMLPartitioner",
+]
diff --git a/backends/apple/coreml/quantizer/__init__.py b/backends/apple/coreml/quantizer/__init__.py
new file mode 100644
index 00000000000..f6282834fa1
--- /dev/null
+++ b/backends/apple/coreml/quantizer/__init__.py
@@ -0,0 +1,9 @@
+# Copyright © 2023 Apple Inc. All rights reserved.
+#
+# Please refer to the license found in the LICENSE file in the root directory of the source tree.
+
+from .coreml_quantizer import CoreMLQuantizer
+
+__all__ = [  # Names exported by `import *`; entries must be strings.
+    "CoreMLQuantizer",
+]
diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py
index e59e5c95544..45c468e450b 100644
--- a/backends/apple/coreml/test/test_coreml_partitioner.py
+++ b/backends/apple/coreml/test/test_coreml_partitioner.py
@@ -9,9 +9,7 @@
 import torch
 import torchvision
 
-from executorch.backends.apple.coreml.partition.coreml_partitioner import (
-    CoreMLPartitioner,
-)
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner
 
 
 class TestCoreMLPartitioner(unittest.TestCase):
diff --git a/backends/apple/coreml/test/test_coreml_quantizer.py b/backends/apple/coreml/test/test_coreml_quantizer.py
index 67eee3593fd..c05cde05a0a 100644
--- a/backends/apple/coreml/test/test_coreml_quantizer.py
+++ b/backends/apple/coreml/test/test_coreml_quantizer.py
@@ -14,7 +14,7 @@
     QuantizationScheme,
 )
 
-from executorch.backends.apple.coreml.quantizer.coreml_quantizer import CoreMLQuantizer
+from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
 from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import (
     convert_pt2e,
diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py
index 966714ba31c..4bf26a7f3ea 100644
--- a/examples/apple/coreml/scripts/export.py
+++ b/examples/apple/coreml/scripts/export.py
@@ -16,9 +16,7 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 
-from executorch.backends.apple.coreml.partition.coreml_partitioner import (
-    CoreMLPartitioner,
-)
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner
 from executorch.exir import to_edge
 
 from executorch.exir.backend.backend_api import to_backend
diff --git a/examples/models/llama2/lib/partitioner_lib.py b/examples/models/llama2/lib/partitioner_lib.py
index 1638a357576..c11e74a7e0b 100644
--- a/examples/models/llama2/lib/partitioner_lib.py
+++ b/examples/models/llama2/lib/partitioner_lib.py
@@ -57,16 +57,14 @@ def get_coreml_partitioner(args):
         args.use_kv_cache is True
     ), "CoreML backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment"
     try:
-        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner`.
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `coremltools`.
         import coremltools as ct
 
         # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.compiler`
         from executorch.backends.apple.coreml.compiler import CoreMLBackend
 
-        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition.coreml_partitioner`
-        from executorch.backends.apple.coreml.partition.coreml_partitioner import (
-            CoreMLPartitioner,
-        )
+        # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.coreml.partition`
+        from executorch.backends.apple.coreml.partition import CoreMLPartitioner
     except ImportError:
         raise ImportError(
             "Please install the CoreML backend follwing https://pytorch.org/executorch/main/build-run-coreml.html"

From 01ce72c93bd7dd620f1150625c622c06dbaa10b6 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Tue, 14 May 2024 11:18:35 -0700
Subject: [PATCH 46/62] Fix includes in Core ML backend. (#3603)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3603

Some headers rely on transitive includes that may be missing when building for different platforms, so we have to include everything explicitly. Also, local header includes need to use quotes rather than angle brackets.

Reviewed By: kirklandsign

Differential Revision: D57340360

fbshipit-source-id: dedc9737314231be5255c06c3ad7c9a800b247b8
---
 .../inmemoryfs/inmemory_filesystem.cpp        |  4 +-
 .../inmemoryfs/inmemory_filesystem.hpp        | 87 ++++++++++---------
 .../inmemory_filesystem_metadata.hpp          |  6 +-
 .../inmemoryfs/inmemory_filesystem_py.cpp     | 13 +--
 .../inmemoryfs/inmemory_filesystem_utils.cpp  | 13 +--
 .../inmemoryfs/inmemory_filesystem_utils.mm   | 35 ++++----
 .../runtime/inmemoryfs/memory_buffer.cpp      |  3 +-
 .../runtime/inmemoryfs/memory_buffer.hpp      | 43 ++++-----
 .../runtime/inmemoryfs/memory_stream.cpp      |  2 +
 .../runtime/inmemoryfs/memory_stream.hpp      | 17 ++--
 .../inmemoryfs/reversed_memory_stream.cpp     |  2 +
 .../inmemoryfs/reversed_memory_stream.hpp     | 13 ++-
 .../apple/coreml/runtime/util/json_util.cpp   |  2 +-
 .../coreml/runtime/util/objc_json_serde.mm    |  2 +-
 14 files changed, 128 insertions(+), 114 deletions(-)

diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp
index bddb7e4d410..f699316cfdb 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp
@@ -10,7 +10,6 @@
 #include <assert.h>
 #include <fstream>
 #include <iostream>
-#include <range.hpp>
 #include <sstream>
 
 #if __has_include(<filesystem>)
@@ -22,7 +21,8 @@ namespace filesystem = std::experimental::filesystem;
 }
 #endif
 
-#include <reversed_memory_stream.hpp>
+#include "range.hpp"
+#include "reversed_memory_stream.hpp"
 
 namespace {
 using namespace inmemoryfs;
diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.hpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.hpp
index fedf4190334..d0ace1a5250 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.hpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.hpp
@@ -8,14 +8,15 @@
 #pragma once
 
 #include <functional>
-#include <inmemory_filesystem_metadata.hpp>
 #include <memory>
-#include <memory_buffer.hpp>
 #include <optional>
 #include <stdio.h>
 #include <string>
 #include <system_error>
 
+#include "inmemory_filesystem_metadata.hpp"
+#include "memory_buffer.hpp"
+
 namespace inmemoryfs {
 
 /// A class representing an in-memory file system.
@@ -29,36 +30,36 @@ class InMemoryFileSystem final {
         DirectoryExpected,     // If path is not a directory.
         FileExpected,          // If the path is not a file.
     };
-    
+
     /// Options for loading file content.
     enum class FileLoadOption: int8_t {
         Malloc = 1,   // Copy file contents into memory.
         MMap,         // Memory map file contents.
         LazyMMap      // Memory map file contents but lazily.
     };
-    
+
     /// The error category for `InMemoryFileSystem`.
     struct ErrorCategory final: public std::error_category {
     public:
         inline const char* name() const noexcept override {
             return "InMemoryFileSystem";
         }
-        
+
         std::string message(int code) const override;
     };
-    
+
     struct Attributes {
         time_t modificationTime;
-        
+
         inline Attributes() noexcept:
         modificationTime(time(0))
         {}
     };
-    
+
     using MetadataWriter = std::function<bool(const InMemoryFileSystemMetadata&, std::ostream&)>;
     using MetadataWriterInMemory = std::function<size_t(const InMemoryFileSystemMetadata&, void *)>;
     using MetadataReader = std::function<std::optional<InMemoryFileSystemMetadata>(std::istream&)>;
-    
+
     /// A class representing an in-memory node. This could either be a file node or a directory node.
     class InMemoryNode {
     public:
@@ -67,7 +68,7 @@ class InMemoryFileSystem final {
             File = 0,   /// Node is a File.
             Directory   /// Node is a Directory.
         };
-        
+
         /// Constructs an in-memory node instance.
         ///
         /// @param name The name of the Node. It must be unique in the enclosing Directory.
@@ -78,38 +79,38 @@ class InMemoryFileSystem final {
         attributes_(std::move(attributes)),
         kind_(kind)
         {}
-        
+
         InMemoryNode(InMemoryNode const&) = delete;
         InMemoryNode& operator=(InMemoryNode const&) = delete;
-        
+
         inline virtual ~InMemoryNode() {}
-        
+
         /// Returns the node attributes.
         inline Attributes attributes() const noexcept {
             return attributes_;
         }
-        
+
         /// Sets the node attributes.
         ///
         /// @param attributes The node attributes.
         inline void set_attributes(Attributes attributes) noexcept {
             attributes_ = std::move(attributes);
         }
-        
+
         /// Returns the node kind, possible values are `File` and `Directory`.
         inline Kind kind() const noexcept {
             return kind_;
         }
-        
+
         /// Returns the name of the node.
         inline const std::string& name() const noexcept {
             return name_;
         }
-        
+
         inline void set_name(std::string name) noexcept {
             std::swap(name_, name);
         }
-        
+
         /// Returns `true` if the node is a directory otherwise `false`.
         inline bool isDirectory() const noexcept {
             switch (kind_) {
@@ -119,58 +120,58 @@ class InMemoryFileSystem final {
                     return false;
             }
         }
-        
+
         /// Returns `true` if the node is a file otherwise `false`.
         inline bool isFile() const noexcept {
             return !isDirectory();
         }
-        
+
     private:
         std::string name_;
         InMemoryFileSystem::Attributes attributes_;
         const Kind kind_;
     };
-    
+
     /// Constructs an`InMemoryFileSystem` instance with an empty root and the specified name.
     ///
     /// @param rootName The name of the root node.
     explicit InMemoryFileSystem(std::string rootName = "root") noexcept;
-    
+
     /// Constructs an`InMemoryFileSystem` instance with the specified root.
     ///
     /// @param root The root node.
     explicit InMemoryFileSystem(std::unique_ptr<InMemoryNode> root) noexcept
     :root_(std::move(root))
     {}
-    
+
     InMemoryFileSystem(InMemoryFileSystem const&) = delete;
     InMemoryFileSystem& operator=(InMemoryFileSystem const&) = delete;
-    
+
     virtual ~InMemoryFileSystem() {}
-    
+
     /// Returns the root.
     InMemoryNode *root() const noexcept {
         return root_.get();
     }
-    
+
     /// Checks if the node at the specified path is a directory.
     ///
     /// @param canonical_path   The path components from the root.
     /// @retval `true` if the node at the specified path is a directory otherwise `false`.
     bool is_directory(const std::vector<std::string>& canonical_path) noexcept;
-    
+
     /// Checks if the node at the specified path is a file.
     ///
     /// @param canonical_path   The path components from the root.
     /// @retval `true` if the node at the specified path is a file otherwise `false`.
     bool is_file(const std::vector<std::string>& canonical_path) noexcept;
-    
+
     /// Checks if the node at the specified path exists.
     ///
     /// @param canonical_path   The path components from the root.
     /// @retval `true` if the node at the specified path exists.
     bool exists(const std::vector<std::string>& canonical_path) const noexcept;
-    
+
     /// Retrieves the canonical path of all the child nodes at the specified path. The node
     /// at the specified path must be a directory otherwise it returns an empty vector with the `error`
     /// populated.
@@ -180,7 +181,7 @@ class InMemoryFileSystem final {
     /// @retval paths to all the items at the specified path.
     std::vector<std::vector<std::string>> get_item_paths(const std::vector<std::string>& canonical_path,
                                                          std::error_code& error) const noexcept;
-    
+
     /// Retrieves the attributes of the item at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -188,7 +189,7 @@ class InMemoryFileSystem final {
     /// @retval The item attributes at the specified path.
     std::optional<Attributes> get_attributes(const std::vector<std::string>& canonical_path,
                                              std::error_code& error) const noexcept;
-    
+
     /// Retrieves the contents of the file at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -196,7 +197,7 @@ class InMemoryFileSystem final {
     /// @retval The file contents or `nullptr` if the item at the specified path is not a file.
     std::shared_ptr<MemoryBuffer> get_file_content(const std::vector<std::string>& canonical_path,
                                                    std::error_code& error) const noexcept;
-    
+
     /// Creates an in-memory directory at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -208,7 +209,7 @@ class InMemoryFileSystem final {
                         Attributes attributes,
                         bool create_intermediate_directories,
                         std::error_code& error) noexcept;
-    
+
     /// Creates an in-memory file at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -222,7 +223,7 @@ class InMemoryFileSystem final {
                    Attributes attributes,
                    bool overwrite,
                    std::error_code& error) noexcept;
-    
+
     /// Removes the item at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -230,7 +231,7 @@ class InMemoryFileSystem final {
     /// @retval `true` if the item is removed otherwise `false`.
     bool remove_item(const std::vector<std::string>& canonical_path,
                      std::error_code& error) noexcept;
-    
+
     /// Sets the attributes at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -239,7 +240,7 @@ class InMemoryFileSystem final {
     bool set_attributes(const std::vector<std::string>& canonical_path,
                         Attributes attributes,
                         std::error_code& error) noexcept;
-    
+
     /// Writes the item at the specified path to the filesystem.
     ///
     /// @param canonical_path  The path components from the root.
@@ -251,7 +252,7 @@ class InMemoryFileSystem final {
                             const std::string& dst_path,
                             bool recursive,
                             std::error_code& error) const noexcept;
-    
+
     /// Renames the item at the specified path, if there is already an item with the same name then
     /// the rename would fail.
     ///
@@ -262,7 +263,7 @@ class InMemoryFileSystem final {
     bool rename_item(const std::vector<std::string>& canonical_path,
                      const std::string& name,
                      std::error_code& error) noexcept;
-    
+
     /// Creates  an`InMemoryFileSystem` from the filesystem path.
     ///
     /// The structure of the `InMemoryFileSystem` is identical to the structure of the filesystem at the
@@ -275,7 +276,7 @@ class InMemoryFileSystem final {
     static std::unique_ptr<InMemoryFileSystem> make_from_directory(const std::string& path,
                                                                    FileLoadOption option,
                                                                    std::error_code& error) noexcept;
-    
+
     /// Serializes the item at the specified path and writes it to the stream.
     ///
     /// The structure of the `InMemoryFileSystem` is identical to the structure of the filesystem at the
@@ -292,7 +293,7 @@ class InMemoryFileSystem final {
                    const MetadataWriter& metadata_writer,
                    std::ostream& ostream,
                    std::error_code& error) const noexcept;
-    
+
     /// Serializes the item at the specified path and writes it to the stream.
     ///
     /// The structure of the `InMemoryFileSystem` is identical to the structure of the filesystem at the
@@ -309,7 +310,7 @@ class InMemoryFileSystem final {
                    const MetadataWriterInMemory& metadata_writer,
                    void *dst,
                    std::error_code& error) const noexcept;
-    
+
     /// Computes the size of the buffer that would be needed to serialized the item at the specified path.
     ///
     /// @param canonical_path  The path components from the root.
@@ -319,7 +320,7 @@ class InMemoryFileSystem final {
     size_t get_buffer_size_for_serialization(const std::vector<std::string>& canonical_path,
                                              size_t alignment,
                                              const MetadataWriter& metadata_writer) const noexcept;
-    
+
     /// Constructs an `InMemoryFileSystem` instance from the buffer contents.
     ///
     /// @param buffer  The memory buffer.
@@ -327,7 +328,7 @@ class InMemoryFileSystem final {
     /// @retval The constructed `InMemoryFileSystem` or `nullptr` if the deserialization failed.
     static std::unique_ptr<InMemoryFileSystem> make_from_buffer(const std::shared_ptr<MemoryBuffer>& buffer,
                                                                 const MetadataReader& metadata_reader) noexcept;
-    
+
 private:
     const std::unique_ptr<InMemoryNode> root_;
 };
diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_metadata.hpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_metadata.hpp
index d9a807a7fc7..4f183205b05 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_metadata.hpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_metadata.hpp
@@ -7,11 +7,12 @@
 
 #pragma once
 
-#include <memory_buffer.hpp>
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include <range.hpp>
+
+#include "memory_buffer.hpp"
+#include "range.hpp"
 
 namespace inmemoryfs {
 
@@ -27,4 +28,3 @@ struct InMemoryFileSystemMetadata {
 };
 
 } // namespace inmemoryfs
-
diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_py.cpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_py.cpp
index 90bc0eb3e1b..66ffa697654 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_py.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_py.cpp
@@ -6,13 +6,9 @@
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 
-#include <inmemory_filesystem_utils.hpp>
 #include <iostream>
 #include <memory>
-#include <memory_buffer.hpp>
-#include <memory_stream.hpp>
-#include <pybind11/pybind11.h>
-#include <pybind11/pytypes.h>
+#include <mutex>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -21,6 +17,13 @@
 #include <thread>
 #include <unistd.h>
 
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+
+#include "inmemory_filesystem_utils.hpp"
+#include "memory_buffer.hpp"
+#include "memory_stream.hpp"
+
 #if __has_include(<filesystem>)
 #include <filesystem>
 #elif __has_include(<experimental/filesystem>)
diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.cpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.cpp
index 1dffacf15a5..a7810e23db3 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.cpp
@@ -5,14 +5,17 @@
 //
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
-#include <inmemory_filesystem_metadata.hpp>
-#include <inmemory_filesystem_metadata_keys.hpp>
-#include <inmemory_filesystem_utils.hpp>
+#include "inmemory_filesystem_utils.hpp"
+
 #include <iostream>
-#include <json.hpp>
-#include <json_util.hpp>
 #include <sstream>
 
+#include <nlohmann/json.hpp>
+
+#include "inmemory_filesystem_metadata.hpp"
+#include "inmemory_filesystem_metadata_keys.hpp"
+#include "json_util.hpp"
+
 namespace inmemoryfs {
 
 using json = nlohmann::json;
diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.mm b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.mm
index 1f018e3c74a..309b95e8d85 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.mm
+++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem_utils.mm
@@ -6,15 +6,18 @@
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 #import "inmemory_filesystem_utils.hpp"
-#import <Foundation/Foundation.h>
-#import <inmemory_filesystem_metadata.hpp>
-#import <inmemory_filesystem_metadata_keys.hpp>
+
 #import <iostream>
-#import <json_util.hpp>
-#import <objc_json_serde.h>
 #import <sstream>
 #import <unordered_map>
 
+#import <Foundation/Foundation.h>
+
+#import "inmemory_filesystem_metadata.hpp"
+#import "inmemory_filesystem_metadata_keys.hpp"
+#import "json_util.hpp"
+#import "objc_json_serde.h"
+
 namespace executorchcoreml {
 namespace serde {
 namespace json {
@@ -29,13 +32,13 @@ static id to_json(const Range& range) {
             to_string(RangeKeys::kSize) : to_json_value(range.size)
         };
     }
-    
+
     static void from_json(id json, Range& range) {
         NSDictionary<NSString *, id> *json_dict = SAFE_CAST(json, NSDictionary);
         if (!json_dict) {
             return;
         }
-        
+
         from_json_value(json_dict[to_string(RangeKeys::kOffset)], range.offset);
         from_json_value(json_dict[to_string(RangeKeys::kSize)], range.size);
     }
@@ -51,13 +54,13 @@ static id to_json(const InMemoryNodeMetadata& node) {
             to_string(InMemoryNodeMetadataKeys::kKind) : to_json_value(node.kind)
         };
     }
-    
+
     static void from_json(id json, InMemoryNodeMetadata& node) {
         NSDictionary<NSString *, id> *json_dict = SAFE_CAST(json, NSDictionary);
         if (!json_dict) {
             return;
         }
-        
+
         from_json_value(json_dict[to_string(InMemoryNodeMetadataKeys::kName)], node.name);
         from_json_value(json_dict[to_string(InMemoryNodeMetadataKeys::kDataRegion)], node.data_region);
         from_json_value(json_dict[to_string(InMemoryNodeMetadataKeys::kChildIndices)], node.child_name_to_indices_map);
@@ -72,13 +75,13 @@ static id to_json(const InMemoryFileSystemMetadata& fs) {
             to_string(InMemoryFileSystemMetadataKeys::kNodes) : to_json_value(fs.nodes)
         };
     }
-    
+
     static void from_json(id json, InMemoryFileSystemMetadata& fs) {
         NSDictionary<NSString *, id> *json_dict = SAFE_CAST(json, NSDictionary);
         if (!json_dict) {
             return;
         }
-        
+
         from_json_value(json_dict[to_string(InMemoryFileSystemMetadataKeys::kNodes)], fs.nodes);
     }
 };
@@ -114,7 +117,7 @@ size_t write_metadata_to_buffer(const InMemoryFileSystemMetadata& metadata, void
     if (!json_object) {
         return std::optional<InMemoryFileSystemMetadata>();
     }
-    
+
     InMemoryFileSystemMetadata metadata;
     Converter<InMemoryFileSystemMetadata>::from_json(to_json_object(json_object.value()), metadata);
     return metadata;
@@ -132,7 +135,7 @@ bool serialize(const InMemoryFileSystem& file_system,
         write_metadata_to_stream(fs_metadata, stream);
         return true;
     };
-    
+
     return file_system.serialize(canonical_path, alignment, metadata_writer, ostream, ec);
 }
 
@@ -145,7 +148,7 @@ bool serialize(const InMemoryFileSystem& file_system,
                                                                     void *metadata_dst) {
         return ::write_metadata_to_buffer(fs_metadata, metadata_dst);
     };
-    
+
     return file_system.serialize(canonical_path, alignment, metadata_writer, dst, ec);
 }
 
@@ -156,7 +159,7 @@ size_t get_buffer_size_for_serialization(const InMemoryFileSystem& file_system,
                                                             std::ostream& stream) {
         return ::write_metadata_to_stream(fs_metadata, stream);
     };
-    
+
     return file_system.get_buffer_size_for_serialization(canonical_path, alignment, metadata_writer);
 }
 
@@ -164,7 +167,7 @@ size_t get_buffer_size_for_serialization(const InMemoryFileSystem& file_system,
     InMemoryFileSystem::MetadataReader metadata_reader = [](std::istream& stream) {
         return ::read_metadata_from_stream(stream);
     };
-    
+
     return InMemoryFileSystem::make_from_buffer(buffer, metadata_reader);
 }
 } // namespace inmemoryfs
diff --git a/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.cpp b/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.cpp
index 61b50a54655..c4485569d56 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.cpp
@@ -5,9 +5,10 @@
 //
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
-#include <memory_buffer.hpp>
+#include "memory_buffer.hpp"
 
 #include <assert.h>
+#include <cstring>
 #include <functional>
 #include <iostream>
 #include <mutex>
diff --git a/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.hpp b/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.hpp
index e6e33f5ce26..5243401e2df 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.hpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/memory_buffer.hpp
@@ -8,12 +8,13 @@
 #pragma once
 
 #include <memory>
-#include <range.hpp>
 #include <stdio.h>
 #include <string>
 #include <system_error>
 #include <vector>
 
+#include "range.hpp"
+
 namespace inmemoryfs {
 /// A class representing a memory buffer.
 class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
@@ -23,38 +24,38 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
         MMap = 0,  // If the buffer is memory mapped.
         Malloc ,   // If the buffer is heap allocated.
     };
-    
+
     enum class ReadOption: uint8_t {
         Malloc = 0,
         MMap,
         LazyMMap
     };
-    
+
     inline MemoryBuffer(void *data,
                         size_t size,
                         Kind kind = Kind::Malloc,
                         std::shared_ptr<MemoryBuffer> parent = nullptr) noexcept:
-    data_(data), 
+    data_(data),
     size_(size),
     kind_(kind),
     parent_(parent)
     {}
-    
+
     MemoryBuffer(const MemoryBuffer &) = delete;
     MemoryBuffer &operator=(const MemoryBuffer &) = delete;
-    
+
     virtual ~MemoryBuffer() noexcept {}
-    
+
     /// Returns the underlying data.
     virtual inline void *data() noexcept {
         return data_;
     }
-    
+
     /// Returns the size of the buffer.
     inline const size_t size() const noexcept {
         return size_;
     }
-    
+
     /// Loads the contents of the buffer.
     ///
     /// - For a malloced buffer, the method is a no op, content is loaded at the initialization time.
@@ -65,12 +66,12 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
     inline virtual bool load(std::error_code& error) noexcept {
         return true;
     }
-    
+
     /// Returns the kind of the buffer.
     inline const Kind kind() const noexcept {
         return kind_;
     }
-    
+
     /// Returns the offset range that would be used when writing the buffer content.
     ///
     /// @param proposed_offset The proposed offset.
@@ -78,7 +79,7 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
     inline virtual std::pair<size_t, size_t> get_offset_range(size_t proposed_offset) const noexcept {
         return {proposed_offset, proposed_offset};
     }
-    
+
     /// Returns the revised range that must be used for writing.
     ///
     /// @param dst  The destination pointer.
@@ -87,7 +88,7 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
     inline virtual Range get_revised_range_for_writing(void *dst, Range proposed_range) const noexcept {
         return proposed_range;
     }
-    
+
     /// Writes the contents of the buffer to the destination buffer at the given offset.
     ///
     /// @param dst The destination pointer.
@@ -97,13 +98,13 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
     virtual bool write(void *dst,
                        size_t offset,
                        std::error_code& error) noexcept;
-    
+
     /// Slices a buffer.
     ///
     /// @param range The memory range.
     /// @retval The sliced buffer if the region is inside the buffer otherwise `nullptr`.
     virtual std::shared_ptr<MemoryBuffer> slice(Range range) noexcept;
-    
+
     /// Reads the file content at the specified path.
     ///
     /// @param file_path The file path.
@@ -116,7 +117,7 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
                       const std::vector<Range>& ranges,
                       ReadOption option,
                       std::error_code& error);
-    
+
     /// Reads the whole file content at the specified path.
     ///
     /// @param file_path The file path.
@@ -127,28 +128,28 @@ class MemoryBuffer: public std::enable_shared_from_this<MemoryBuffer> {
     read_file_content(const std::string& file_path,
                       ReadOption option,
                       std::error_code& error);
-    
+
     /// Constructs a `MemoryBuffer`.
     ///
     /// @param size The size of the buffer.
     /// @param alignment The address alignment.
     static std::unique_ptr<MemoryBuffer>
     make_using_malloc(size_t size, size_t alignment = 1);
-    
-    
+
+
     /// Constructs a `MemoryBuffer` from memory allocated using `mmap`.
     ///
     /// @param size The size of the buffer.
     static std::unique_ptr<MemoryBuffer>
     make_using_mmap(size_t size);
-    
+
     /// Constructs a `MemoryBuffer` without copying data.
     ///
     /// @param data The buffer content.
     /// @param size The size of the buffer.
     static std::unique_ptr<MemoryBuffer>
     make_unowned(void *data, size_t size);
-    
+
     /// Constructs a `MemoryBuffer` with copying data.
     ///
     /// @param data The buffer content.
diff --git a/backends/apple/coreml/runtime/inmemoryfs/memory_stream.cpp b/backends/apple/coreml/runtime/inmemoryfs/memory_stream.cpp
index 5078db66e80..cb634234c5c 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/memory_stream.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/memory_stream.cpp
@@ -7,6 +7,8 @@
 
 #include "memory_stream.hpp"
 
+#include <limits>
+
 namespace inmemoryfs {
 
 MemoryStreamBuf::MemoryStreamBuf(const std::shared_ptr<MemoryBuffer>& buffer) noexcept : buffer_(buffer) {
diff --git a/backends/apple/coreml/runtime/inmemoryfs/memory_stream.hpp b/backends/apple/coreml/runtime/inmemoryfs/memory_stream.hpp
index a5f40f26b5f..f7a8100f74f 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/memory_stream.hpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/memory_stream.hpp
@@ -7,18 +7,18 @@
 
 #pragma once
 
-#include <memory_buffer.hpp>
-
 #include <istream>
 #include <ostream>
 
+#include "memory_buffer.hpp"
+
 namespace inmemoryfs {
 
 /// A class representing an in-memory stream buffer.
 class MemoryStreamBuf: public std::streambuf {
 public:
     ~MemoryStreamBuf() = default;
-    
+
     /// Constructs a `MemoryStreamBuf` from a `MemoryBuffer`.
     ///
     /// @param buffer  The memory buffer.
@@ -31,7 +31,7 @@ class MemoryStreamBuf: public std::streambuf {
     /// @param dir  The seek direction.
     /// @retval The stream position.
     pos_type iseekoff(off_type offset, std::ios_base::seekdir dir);
-    
+
     /// Called by `seekof` if the `openmode` is output.
     ///
     /// @param offset  The offset  value relative to the `dir`.
@@ -44,7 +44,7 @@ class MemoryStreamBuf: public std::streambuf {
     /// @param which  The open mode.
     /// @retval The stream position.
     pos_type seekpos(pos_type pos, std::ios_base::openmode which) override;
-    
+
     /// Called by the public member function `pubseekoff` to alter the stream position.
     ///
     /// @param offset  The offset  value relative to the `dir`.
@@ -74,18 +74,18 @@ class MemoryStreamBuf: public std::streambuf {
     ///
     /// Returns the value of the current character, converted to a value of type int.
     std::streambuf::int_type uflow() override;
-    
+
     /// Called by other member functions to put a character into the controlled output sequence.
     ///
     /// Returns the value of the character that's put into the stream, converted to a value of type int.
     int_type overflow(int_type ch) override;
-    
+
     /// Retrieves characters from the controlled input sequence and stores them in the array pointed by s,
     /// until either n characters have been extracted or the end of the sequence is reached.
     ///
     /// Returns the number of characters copied.
     std::streamsize xsgetn(char *s, std::streamsize n) override;
-    
+
     /// Writes characters from the array pointed to by s into the controlled output sequence,
     /// until either n characters have been written or the end of the output sequence is reached.
     ///
@@ -122,4 +122,3 @@ class MemoryOStream final : public std::ostream  {
 };
 
 }
-
diff --git a/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.cpp b/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.cpp
index e38b9d08b19..7fe6c26ca41 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.cpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.cpp
@@ -7,6 +7,8 @@
 
 #include "reversed_memory_stream.hpp"
 
+#include <limits>
+
 namespace inmemoryfs {
 
 ReversedIMemoryStreamBuf::ReversedIMemoryStreamBuf(std::shared_ptr<MemoryBuffer> buffer) noexcept
diff --git a/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.hpp b/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.hpp
index 1827af36413..09b3606bfe0 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.hpp
+++ b/backends/apple/coreml/runtime/inmemoryfs/reversed_memory_stream.hpp
@@ -7,18 +7,18 @@
 
 #pragma once
 
-#include <memory_buffer.hpp>
-
 #include <istream>
 #include <ostream>
 
+#include "memory_buffer.hpp"
+
 namespace inmemoryfs {
 
 /// A class for reading an in-memory stream buffer in reverse.
 class ReversedIMemoryStreamBuf: public std::streambuf {
 public:
     ~ReversedIMemoryStreamBuf() = default;
-    
+
     /// Constructs a `ReversedIMemoryStreamBuf` from a `MemoryBuffer`.
     ///
     /// @param buffer  The memory buffer.
@@ -50,7 +50,7 @@ class ReversedIMemoryStreamBuf: public std::streambuf {
     ///
     /// Returns the value of the current character, converted to a value of type int.
     std::streambuf::int_type uflow() override;
-    
+
     /// Retrieves characters from the controlled input sequence and stores them in the array pointed by s,
     /// until either n characters have been extracted or the end of the sequence is reached.
     ///
@@ -60,7 +60,7 @@ class ReversedIMemoryStreamBuf: public std::streambuf {
 private:
     /// Reads the character at the specified position.
     std::streambuf::int_type read(char *pos);
-    
+
     const std::shared_ptr<MemoryBuffer> buffer_;
     char *start_;
     char *current_;
@@ -70,7 +70,7 @@ class ReversedIMemoryStreamBuf: public std::streambuf {
 /// A class for reading an in-memory buffer in reverse.
 class ReversedIMemoryStream final : public std::istream  {
 public:
-    
+
     /// Constructs a `ReversedIMemoryStream` from a `MemoryBuffer`.
     ///
     /// @param buffer  The memory buffer.
@@ -83,4 +83,3 @@ class ReversedIMemoryStream final : public std::istream  {
 };
 
 }
-
diff --git a/backends/apple/coreml/runtime/util/json_util.cpp b/backends/apple/coreml/runtime/util/json_util.cpp
index 80605c55e8f..a7592541a49 100644
--- a/backends/apple/coreml/runtime/util/json_util.cpp
+++ b/backends/apple/coreml/runtime/util/json_util.cpp
@@ -6,7 +6,7 @@
 //
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
-#include <json_util.hpp>
+#include "json_util.hpp"
 
 #include <string>
 #include <vector>
diff --git a/backends/apple/coreml/runtime/util/objc_json_serde.mm b/backends/apple/coreml/runtime/util/objc_json_serde.mm
index 0f55d4b5919..9102046a759 100644
--- a/backends/apple/coreml/runtime/util/objc_json_serde.mm
+++ b/backends/apple/coreml/runtime/util/objc_json_serde.mm
@@ -7,7 +7,7 @@
 // Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 
-#import <objc_json_serde.h>
+#import "objc_json_serde.h"
 
 namespace executorchcoreml {
 namespace serde {

From b64182d6413429851a4e1b444e3721f6d3764ee0 Mon Sep 17 00:00:00 2001
From: Riley Dulin <dulinr@meta.com>
Date: Tue, 14 May 2024 11:45:56 -0700
Subject: [PATCH 47/62] Fix memory.view insertion except for output nodes
 (#3602)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3602

The previous implementation of ignoring `view_copy` on outputs was incorrect
in that it only checked `node.next` instead of all users of the node.
`node.next` just selects the next node in topological order, which may or
may not be the output if there is more than one output. In the case of more
than one output, the next node may not be related at all!

Check if any of the users of the node are an output instead.

Reviewed By: metascroy, mcremon-meta

Differential Revision: D57299853

fbshipit-source-id: 6a373181f6bdd58444e0c859fce320d576b7f749
---
 .../replace_view_copy_with_view_pass.py       |  8 +++++---
 exir/tests/test_passes.py                     |  9 +++++----
 exir/tests/test_remove_view_copy.py           | 19 ++++++++++++++-----
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/exir/passes/replace_view_copy_with_view_pass.py b/exir/passes/replace_view_copy_with_view_pass.py
index 8d3a2a32126..378b9332119 100644
--- a/exir/passes/replace_view_copy_with_view_pass.py
+++ b/exir/passes/replace_view_copy_with_view_pass.py
@@ -275,7 +275,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             for node in module.graph.nodes:
                 # Note: We only replace view_copy nodes that are not output, since
                 # the output pointer could be modified at runtime (T187925929)
-                if _is_view_copy(node) and node.next.op != "output":
+                if _is_view_copy(node) and all(u.op != "output" for u in node.users):
                     base, _ = node.args
                     node.target = _VIEW_OP
 
@@ -302,7 +302,9 @@ def ensures(self, graph_module: torch.fx.GraphModule) -> None:
             for node in module.graph.nodes:
                 # Note: We only replace view_copy nodes that are not output, since
                 # the output pointer could be modified at runtime (T187925929)
-                assert not (_is_view_copy(node) and node.next.op != "output")
+                assert not (
+                    _is_view_copy(node) and all(u.op != "output" for u in node.users)
+                )
                 if node.op == "call_function" and node.target == _VIEW_OP:
                     assert isinstance(node.meta["spec"], _ViewSpec)
 
@@ -317,6 +319,6 @@ def requires(self, graph_module: torch.fx.GraphModule) -> None:
             for node in module.graph.nodes:
                 # Note: We only replace view_copy nodes that are not output, since
                 # the output pointer could be modified at runtime (T187925929)
-                if _is_view_copy(node) and node.next.op != "output":
+                if _is_view_copy(node) and all(u.op != "output" for u in node.users):
                     base, size = node.args
                     assert not _is_view_copy(base)
diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py
index 0377f70a150..f65ccff13b0 100644
--- a/exir/tests/test_passes.py
+++ b/exir/tests/test_passes.py
@@ -1602,7 +1602,9 @@ def __init__(self):
             def forward(self, x):
                 o1 = torch.ops.aten.view_copy.default(x, [1])
                 o2 = torch.ops.aten.view_copy.default(self.parameter, [1])
-                return o1, o2
+                # view_copys at the end of a function are not replaced, so add
+                # a computation before the end of the graph.
+                return torch.ops.aten.add.Tensor(o1, o2)
 
         ep = torch.export.export(
             TestViewCopies(),
@@ -1631,10 +1633,9 @@ def forward(self, x):
         gm = gm_res.graph_module
 
         # Check after transformation
-        # Note: one view copy is not replaced, because it's the output of the graph
         FileCheck().check_count(
-            "torch.ops.aten.view_copy.default", 1, exactly=True
+            "torch.ops.aten.view_copy.default", 0, exactly=True
         ).run(gm.code)
-        FileCheck().check_count("executorch_exir_memory_view", 1, exactly=True).run(
+        FileCheck().check_count("executorch_exir_memory_view", 2, exactly=True).run(
             gm.code
         )
diff --git a/exir/tests/test_remove_view_copy.py b/exir/tests/test_remove_view_copy.py
index b3ad1f7d5a7..f64a1f19981 100644
--- a/exir/tests/test_remove_view_copy.py
+++ b/exir/tests/test_remove_view_copy.py
@@ -32,7 +32,8 @@ def forward(self, x):
         )  # removed, lifetime of mul.Tensor will be extended
         v4 = torch.ops.aten.mul.Tensor(v3, self.parameter2)
         v5 = v4.view(6, 5)  # not removed, output of the graph
-        return v5
+        v6 = v4.view(2, 15)  # not removed, output of the graph
+        return v5, v6
 
     def get_example_inputs(self):
         return (torch.rand(5, 6),)
@@ -87,10 +88,15 @@ def test_output_matches(self) -> None:
             ),
         )
 
-        out_remove = etpm_remove.exported_program().module()(*example_inputs)
-        out_no_remove = etpm_no_remove.exported_program().module()(*example_inputs)
+        out_remove_v5, out_remove_v6 = etpm_remove.exported_program().module()(
+            *example_inputs
+        )
+        out_no_remove_v5, out_no_remove_v6 = etpm_no_remove.exported_program().module()(
+            *example_inputs
+        )
 
-        self.assertTrue(torch.allclose(out_remove, out_no_remove))
+        self.assertTrue(torch.allclose(out_remove_v5, out_no_remove_v5))
+        self.assertTrue(torch.allclose(out_remove_v6, out_no_remove_v6))
 
     def test_spec(self) -> None:
         model = TestModel1()
@@ -196,7 +202,7 @@ def test_spec(self) -> None:
         self.assertEqual(plan.operators[2].name, "aten::view_copy")
 
         instructions = plan.chains[0].instructions
-        self.assertEqual(len(instructions), 6)
+        self.assertEqual(len(instructions), 7)
 
         self.assertEqual(
             instructions[0].instr_args.op_index, 0  # pyre-ignore
@@ -216,3 +222,6 @@ def test_spec(self) -> None:
         self.assertEqual(
             instructions[5].instr_args.op_index, 2  # pyre-ignore
         )  # aten:view_copy @ idx11
+        self.assertEqual(
+            instructions[6].instr_args.op_index, 2  # pyre-ignore
+        )  # aten:view_copy @ idx11

From e8a520c4f37faf378da708006f090530052fce29 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Tue, 14 May 2024 13:27:49 -0700
Subject: [PATCH 48/62] Fix headers search paths for nlohmann json in Core ML.
 (#3607)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3607

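Point the nlohmann json header search paths in `setup.py` and the Xcode project at `single_include` instead of `single_include/nlohmann`, so that `#include <nlohmann/json.hpp>` resolves.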

Reviewed By: kirklandsign

Differential Revision: D57348926

fbshipit-source-id: f867150138f2b8162ea51de245a980606022f018
---
 backends/apple/coreml/runtime/inmemoryfs/setup.py             | 2 +-
 .../workspace/executorchcoreml.xcodeproj/project.pbxproj      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/apple/coreml/runtime/inmemoryfs/setup.py b/backends/apple/coreml/runtime/inmemoryfs/setup.py
index 95818485ca8..c93022ed341 100644
--- a/backends/apple/coreml/runtime/inmemoryfs/setup.py
+++ b/backends/apple/coreml/runtime/inmemoryfs/setup.py
@@ -30,7 +30,7 @@
         cxx_std=cxx_std,
         extra_compile_args=["-mmacosx-version-min=10.15", "-g"],
         include_dirs=[
-            "../../third-party/nlohmann_json/single_include/nlohmann",
+            "../../third-party/nlohmann_json/single_include",
             ".",
             "../util",
         ],
diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
index d8ee4ea693a..d8a5e611077 100644
--- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
+++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj
@@ -900,7 +900,7 @@
 					"$(SRCROOT)/../include",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
-					"$(SRCROOT)/../../third-party/nlohmann_json/single_include/nlohmann",
+					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
 					"$(SRCROOT)/../../third-party/coremltools/deps/protobuf/src",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
@@ -931,7 +931,7 @@
 					"$(SRCROOT)/../include",
 					"$(SRCROOT)/../sdk",
 					"$(SRCROOT)/../util",
-					"$(SRCROOT)/../../third-party/nlohmann_json/single_include/nlohmann",
+					"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
 					"$(SRCROOT)/../../third-party/coremltools/deps/protobuf/src",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;

From 0e7955d2263580136d85cc50ae69873ad3adcd07 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Tue, 14 May 2024 14:46:25 -0700
Subject: [PATCH 49/62] Implement `aten.linear.default` (#3594)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3594

As title.

The implementation is rather simple because the shaders just have to accumulate over `mat2` along the width dim rather than the height dim.

Reviewed By: yipjustin

Differential Revision: D57203869

fbshipit-source-id: 08932a75e66924a0dfb0816f8ccefa718a341dd8
---
 .../runtime/graph/ops/glsl/addmm_naive.glsl   |  11 +-
 .../runtime/graph/ops/glsl/addmm_naive.yaml   |   4 +
 .../graph/ops/glsl/addmm_optimized.glsl       |  22 ++--
 .../graph/ops/glsl/addmm_optimized.yaml       |   3 +
 .../vulkan/runtime/graph/ops/glsl/matmul.h    | 102 +++++++++-------
 .../runtime/graph/ops/glsl/matmul_naive.glsl  |  12 +-
 .../runtime/graph/ops/glsl/matmul_naive.yaml  |   4 +
 .../graph/ops/glsl/matmul_optimized.glsl      |  13 +--
 .../graph/ops/glsl/matmul_optimized.yaml      |   3 +
 .../vulkan/runtime/graph/ops/glsl/view.glsl   |  16 +--
 .../vulkan/runtime/graph/ops/impl/Linear.cpp  | 109 ++++++++++++++----
 .../vulkan/runtime/graph/ops/impl/MatMul.cpp  |  65 +++++++----
 .../vulkan/runtime/graph/ops/impl/MatMul.h    |  22 ++++
 .../vulkan/runtime/graph/ops/impl/View.cpp    |  54 ++++++++-
 backends/vulkan/test/op_tests/cases.py        |  21 ++++
 15 files changed, 330 insertions(+), 131 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/MatMul.h

diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
index abdbe24d223..dbc87eb7944 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl
@@ -10,6 +10,9 @@
 
 #define PRECISION ${PRECISION}
 
+$if MAT2_IS_TRANSPOSED:
+  #define MAT2_IS_TRANSPOSED
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -45,7 +48,6 @@ void main() {
   }
 
   vec4 texel = vec4(0);
-  ivec3 mat1_pos = ivec3(0, pos.y, pos.z);
 
   $if MAT1_PACKING == "W_packed":
     $if MAT2_PACKING == "H_packed":
@@ -53,16 +55,13 @@ void main() {
       texel = matmul_naive_W_packed_H_packed(
           im_mat1,
           im_mat2,
-          mat1_pos,
-          mat2_pos,
+          pos,
           in_sizes[0]);
     $elif MAT2_PACKING == "W_packed":
-      ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);
       texel = matmul_naive_W_packed_W_packed(
           im_mat1,
           im_mat2,
-          mat1_pos,
-          mat2_pos,
+          pos,
           in_sizes[0]);
     $else:
       $raise Exception("Unsupported value for MAT2_PACKING")
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
index 6861b312d5f..48db85cb56e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml
@@ -10,6 +10,7 @@ addmm_naive:
     NDIM: 3
     MAT1_PACKING: W_packed
     MAT2_PACKING: H_packed
+    MAT2_IS_TRANSPOSED: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -18,3 +19,6 @@ addmm_naive:
     - NAME: addmm_naive_W_packed_H_packed
     - NAME: addmm_naive_W_packed_W_packed
       MAT2_PACKING: W_packed
+    - NAME: linear_naive_W_packed_W_packed
+      MAT2_PACKING: W_packed
+      MAT2_IS_TRANSPOSED: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
index 2830a34290f..9d45c33704f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl
@@ -10,6 +10,9 @@
 
 #define PRECISION ${PRECISION}
 
+$if MAT2_IS_TRANSPOSED:
+  #define MAT2_IS_TRANSPOSED
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -31,11 +34,8 @@ layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes {
   ivec4 self_sizes;
 };
 
-layout(set = 0, binding = 7) uniform PRECISION restrict PackedDimMeta {
-  int packed_dim_size;
-  int packed_dim_size_padded;
-  int packed_dim_texel_len;
-  int packed_dim_padding;
+layout(set = 0, binding = 7) uniform PRECISION restrict InLimits {
+  ivec3 in_limits;
 };
 
 layout(set = 0, binding = 8) uniform PRECISION restrict Params {
@@ -57,8 +57,7 @@ void main() {
       im_mat2,
       pos,
       out_sizes[2],
-      packed_dim_texel_len,
-      packed_dim_padding);
+      in_limits[0]);
 
   for (int idx_c = 0; idx_c < FOUR; idx_c++) {
     for (int idx_r = 0; idx_r < FOUR; idx_r++) {
@@ -70,17 +69,16 @@ void main() {
           out_pos,
           self_sizes.x == 1,
           self_sizes.y == 1);
-      results.data[idx_c][idx_r][0] = beta * self_texel.x + alpha * results.data[idx_c][idx_r][0];
 
       // results is in transposed order w.r.t. the desired output
       imageStore(
           im_out,
           out_pos,
           vec4(
-              results.data[idx_c][idx_r][0],
-              results.data[idx_c][idx_r][1],
-              results.data[idx_c][idx_r][2],
-              results.data[idx_c][idx_r][3]));
+              beta * self_texel.x + alpha * results.data[idx_c][idx_r][0],
+              beta * self_texel.x + alpha * results.data[idx_c][idx_r][1],
+              beta * self_texel.x + alpha * results.data[idx_c][idx_r][2],
+              beta * self_texel.x + alpha * results.data[idx_c][idx_r][3]));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
index 53352342a84..73014d440dd 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml
@@ -9,9 +9,12 @@ addmm_optimized:
     DTYPE: float
     NDIM: 3
     PACKING: C_packed
+    MAT2_IS_TRANSPOSED: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
     - NAME: addmm_optimized
+    - NAME: linear_optimized
+      MAT2_IS_TRANSPOSED: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h
index ec00a53a649..5a7f6795879 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul.h
@@ -16,38 +16,66 @@ struct FloatMatrix {
   float data[FOUR][FOUR][FOUR];
 };
 
+#ifdef MAT2_IS_TRANSPOSED
+vec4 matmul_naive_W_packed_W_packed(
+#else
 vec4 matmul_naive_W_packed_H_packed(
-    sampler3D im_mat1,
-    sampler3D im_mat2,
-    ivec3 mat1_pos,
-    ivec3 mat2_pos,
+#endif
+    const sampler3D im_mat1,
+    const sampler3D im_mat2,
+    const ivec3 out_pos,
     const int width) {
+  ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z);
+#ifdef MAT2_IS_TRANSPOSED
+  ivec3 mat2_pos = ivec3(0, out_pos.x * 4, 0);
+#else
+  ivec3 mat2_pos = ivec3(out_pos.x * 4, 0, out_pos.z);
+#endif
+
   vec4 texel = vec4(0);
-  int K = (width + 3) / 4;
+  const int K = (width + 3) / 4;
 
   for (int i = 0; i < K; ++i) {
-    vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0);
-    vec4 sums = vec4(
+    const vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0);
+#ifdef MAT2_IS_TRANSPOSED
+    const vec4 sums = vec4(
+        dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)),
+        dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 1, 0), 0)),
+        dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 2, 0), 0)),
+        dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 3, 0), 0)));
+#else
+    const vec4 sums = vec4(
         dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)),
         dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)),
         dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)),
         dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0)));
+#endif
 
     texel += sums;
 
     mat1_pos.x++;
+#ifdef MAT2_IS_TRANSPOSED
+    mat2_pos.x++;
+#else
     mat2_pos.y++;
+#endif
   }
 
   return texel;
 }
 
+#ifdef MAT2_IS_TRANSPOSED
+vec4 matmul_naive_W_packed_H_packed(
+#else
 vec4 matmul_naive_W_packed_W_packed(
-    sampler3D im_mat1,
-    sampler3D im_mat2,
-    ivec3 mat1_pos,
-    ivec3 mat2_pos,
+#endif
+    const sampler3D im_mat1,
+    const sampler3D im_mat2,
+    const ivec3 out_pos,
     const int width) {
+  ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z);
+  ivec3 mat2_pos = ivec3(out_pos.x, 0, out_pos.z);
+
   vec4 texel = vec4(0);
   int K = divup4(width);
 
@@ -87,7 +115,7 @@ vec4 get_texel_W_packed(
   else if (broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0);
   } else {
-    self_texel = texelFetch(im_self, pos, 0);
+    self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0);
   }
 
   return self_texel;
@@ -112,7 +140,7 @@ vec4 get_texel_C_packed(
   else if (broadcast_at_height) {
     self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0);
   } else {
-    self_texel = texelFetch(im_self, pos, 0);
+    self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0);
   }
 
   return self_texel;
@@ -123,8 +151,7 @@ FloatMatrix matmul_partial_4x4(
     sampler3D im_mat2,
     const ivec3 pos,
     const int batch_size,
-    const int K_texel_len,
-    const int packed_dim_padding) {
+    const int K_texel_len) {
   FloatMatrix results;
   for (int i = 0; i < FOUR; i++) {
     for (int j = 0; j < FOUR; j++) {
@@ -133,43 +160,36 @@ FloatMatrix matmul_partial_4x4(
       }
     }
   }
-  vec4 im_mat1_partial_rows[FOUR];
-  vec4 im_mat2_partial_cols[FOUR];
+  vec4 im_mat1_partial_load[FOUR];
+  vec4 im_mat2_partial_load[FOUR];
 
   for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) {
     if (FOUR * pos.z + batch_idx >= batch_size) {
       break;
     }
-    // read and cache 4x4 tile of im_mat1 (4 adjacent rows)
+    int mat_z = FOUR * pos.z + batch_idx;
     for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) {
-      for (int mat1_row = 0; mat1_row < FOUR; mat1_row++) {
-        const int mat1_y = (FOUR * pos.y) + mat1_row;
-        const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, FOUR * pos.z + batch_idx);
-        im_mat1_partial_rows[mat1_row] = texelFetch(im_mat1, mat1_pos, 0);
-        // set the value out of the boundary to be 0
-        if (mat1_x == K_texel_len - 1 && packed_dim_padding > 0) {
-          for (int kk = 0; kk < packed_dim_padding; kk++) {
-            im_mat1_partial_rows[mat1_row][3 - kk] = 0;
-          }
-        }
-      }
-      // read and cache 4x4 tile of im_mat2 (4 adjacent columns)
-      for (int mat2_col = 0; mat2_col < FOUR; mat2_col++) {
-        const int mat2_x = (FOUR * pos.x) + mat2_col;
-        const ivec3 pos_rd = ivec3(mat2_x, mat1_x, FOUR * pos.z + batch_idx);
-        im_mat2_partial_cols[mat2_col] = texelFetch(im_mat2, pos_rd, 0);
-        // set the value out of the boundary to be 0
-        if (mat1_x == K_texel_len - 1 && packed_dim_padding > 0) {
-          for (int kk = 0; kk < packed_dim_padding; kk++) {
-            im_mat2_partial_cols[mat2_col][3 - kk] = 0;
-          }
-        }
+      for (int offset = 0; offset < FOUR; offset++) {
+        // read and cache 4x4 tile of im_mat1
+        const int mat1_y = (FOUR * pos.y) + offset;
+        const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z);
+        im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0);
+        // read and cache 4x4 tile of im_mat2
+#ifdef MAT2_IS_TRANSPOSED
+        const int mat2_y = (FOUR * pos.x) + offset;
+        const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0);
+        im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0);
+#else
+        const int mat2_x = (FOUR * pos.x) + offset;
+        const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, mat_z);
+        im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0);
+#endif
       }
       // perform partial dot products and add partial result to results
       for (int out_row = 0; out_row < FOUR; out_row++) {
         for (int out_col = 0; out_col < FOUR; out_col++) {
           results.data[out_row][out_col][batch_idx] +=
-              dot(im_mat1_partial_rows[out_row], im_mat2_partial_cols[out_col]);
+              dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]);
         }
       }
     }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
index d7e4395d04f..37a9b60f3c5 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.glsl
@@ -10,6 +10,9 @@
 
 #define PRECISION ${PRECISION}
 
+$if MAT2_IS_TRANSPOSED:
+  #define MAT2_IS_TRANSPOSED
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -35,24 +38,19 @@ void main() {
   }
 
   vec4 texel = vec4(0);
-  ivec3 mat1_pos = ivec3(0, pos.y, pos.z);
 
   $if MAT1_PACKING == "W_packed":
     $if MAT2_PACKING == "H_packed":
-      ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z);
       texel = matmul_naive_W_packed_H_packed(
           im_mat1,
           im_mat2,
-          mat1_pos,
-          mat2_pos,
+          pos,
           in_sizes[0]);
     $elif MAT2_PACKING == "W_packed":
-      ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);
       texel = matmul_naive_W_packed_W_packed(
           im_mat1,
           im_mat2,
-          mat1_pos,
-          mat2_pos,
+          pos,
           in_sizes[0]);
     $else:
       $raise Exception("Unsupported value for MAT2_PACKING")
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
index 727e8b361d8..1c4db3f0ce9 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive.yaml
@@ -10,6 +10,7 @@ matmul_naive:
     NDIM: 3
     MAT1_PACKING: W_packed
     MAT2_PACKING: H_packed
+    MAT2_IS_TRANSPOSED: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -18,3 +19,6 @@ matmul_naive:
     - NAME: matmul_naive_W_packed_H_packed
     - NAME: matmul_naive_W_packed_W_packed
       MAT2_PACKING: W_packed
+    - NAME: matmul_transposed_naive_W_packed_W_packed
+      MAT2_PACKING: W_packed
+      MAT2_IS_TRANSPOSED: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
index dd9c57416d2..f39bea12be3 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl
@@ -10,6 +10,9 @@
 
 #define PRECISION ${PRECISION}
 
+$if MAT2_IS_TRANSPOSED:
+  #define MAT2_IS_TRANSPOSED
+
 #include "indexing_utils.h"
 #include "matmul.h"
 
@@ -25,11 +28,8 @@ layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
   ivec4 out_sizes;
 };
 
-layout(set = 0, binding = 5) uniform PRECISION restrict PackedDimMeta {
-  int packed_dim_size;
-  int packed_dim_size_padded;
-  int packed_dim_texel_len;
-  int packed_dim_padding;
+layout(set = 0, binding = 5) uniform PRECISION restrict InLimits {
+  ivec3 in_limits;
 };
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -46,8 +46,7 @@ void main() {
       im_mat2,
       pos,
       out_sizes[2],
-      packed_dim_texel_len,
-      packed_dim_padding);
+      in_limits[0]);
 
   for (int idx_c = 0; idx_c < FOUR; idx_c++) {
     for (int idx_r = 0; idx_r < FOUR; idx_r++) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
index 7cec20e167c..ecc62f7ca3c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml
@@ -9,9 +9,12 @@ matmul_optimized:
     DTYPE: float
     NDIM: 3
     PACKING: C_packed
+    MAT2_IS_TRANSPOSED: false
   generate_variant_forall:
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
     - NAME: matmul_optimized
+    - NAME: matmul_transposed_optimized
+      MAT2_IS_TRANSPOSED: true
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl
index 2429c841c9c..6680baad031 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl
@@ -35,7 +35,7 @@ layout(constant_id = 4) const int out_packed_dim = C_DIM;
 
 void main() {
 	const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-	const ivec4 out_tensor_idx = to_tensor_idx(out_pos, out_sizes, out_packed_dim);
+	ivec4 out_tensor_idx = to_tensor_idx(out_pos, out_sizes, out_packed_dim);
 
   if (all(greaterThanEqual(out_tensor_idx, out_sizes))) {
     return;
@@ -46,13 +46,15 @@ void main() {
   // the input position from the indx.
   const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, out_packed_dim);
 
-  VEC4_T value;
+  VEC4_T value = VEC4_T(0);
   // Need to look up the 4 values in the output texel separately.
-  for (int i =0 ; i < 4; i++) {
-    ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes);
-    ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim);
-    VEC4_T intex = texelFetch(image_in, in_pos_elem.xyz, 0);
-    value[i] = intex[in_pos_elem.w];
+  for (int i = 0 ; i < 4; i++) {
+    if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) {
+      ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes);
+      ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim);
+      VEC4_T intex = texelFetch(image_in, in_pos_elem.xyz, 0);
+      value[i] = intex[in_pos_elem.w];
+    }
   }
 
   imageStore(image_out, out_pos, value);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
index 9e4ea7a9ba0..8c963579da9 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/MatMul.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
@@ -56,21 +57,27 @@ void resize_addmm_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
     const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]);
   vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]);
   vTensorPtr self = graph->get_tensor(args[1].refs[2]);
 
+  bool mat2_is_transposed = graph->get_bool(extra_args[0]);
+
+  const int out_cols = api::utils::val_at(-2, mat1->sizes());
+  const int out_rows = mat2_is_transposed
+      ? api::utils::val_at(-2, mat2->sizes())
+      : api::utils::val_at(-1, mat2->sizes());
+
   std::vector<int64_t> new_out_sizes(3);
   if (mat1->sizes().size() == 2) {
     new_out_sizes.resize(2);
-    new_out_sizes.at(0) = mat1->sizes().at(0);
-    new_out_sizes.at(1) = mat2->sizes().at(1);
+    new_out_sizes.at(0) = out_cols;
+    new_out_sizes.at(1) = out_rows;
   } else {
     new_out_sizes.at(0) = mat1->sizes().at(0);
-    new_out_sizes.at(1) = mat1->sizes().at(1);
-    new_out_sizes.at(2) = mat2->sizes().at(2);
+    new_out_sizes.at(1) = out_cols;
+    new_out_sizes.at(2) = out_rows;
   }
 
   out->virtual_resize(new_out_sizes);
@@ -83,19 +90,22 @@ struct Params final {
 
 void add_addmm_naive_node(
     ComputeGraph& graph,
-    const ValueRef self,
+    const ValueRef self_data,
     const ValueRef mat1,
     const ValueRef mat2_data,
     const ValueRef beta,
     const ValueRef alpha,
     const ValueRef out,
-    const Params& params) {
+    const Params& params,
+    const ValueRef mat2_is_transposed) {
+  ValueRef self = prepack_if_tensor_ref(graph, self_data, api::kWidthPacked);
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
   api::utils::uvec3 global_size = graph.extents_of(out);
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  std::string kernel_name("addmm_naive");
+  std::string kernel_name =
+      graph.get_bool(mat2_is_transposed) ? "linear_naive" : "addmm_naive";
   kernel_name.reserve(kShaderNameReserve);
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1));
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2));
@@ -119,18 +129,21 @@ void add_addmm_naive_node(
       // Specialization Constants
       {},
       // Resizing Logic
-      resize_addmm_node));
+      resize_addmm_node,
+      {mat2_is_transposed}));
 }
 
 void add_addmm_optimized_node(
     ComputeGraph& graph,
-    const ValueRef self,
+    const ValueRef self_data,
     const ValueRef mat1,
     const ValueRef mat2_data,
     const ValueRef beta,
     const ValueRef alpha,
     const ValueRef out,
-    const Params& params) {
+    const Params& params,
+    const ValueRef mat2_is_transposed) {
+  ValueRef self = prepack_if_tensor_ref(graph, self_data, api::kChannelsPacked);
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
   // Ensure mat1 is width packed
@@ -138,18 +151,24 @@ void add_addmm_optimized_node(
   auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
   viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
 
+  const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed);
+
   // Ensure mat2 is height packed
-  ValueRef mat2_H_packed = mat2;
-  if (graph.memory_layout_of(mat2) != api::kHeightPacked) {
-    mat2_H_packed = graph.add_tensor_like(mat2, api::kHeightPacked);
-    viewFn(graph, {mat2, graph.add_none(), mat2_H_packed});
+  ValueRef mat2_packed = mat2;
+  const api::GPUMemoryLayout mat2_layout =
+      mat2_is_transposed_val ? api::kWidthPacked : api::kHeightPacked;
+  if (graph.memory_layout_of(mat2) != mat2_layout) {
+    mat2_packed = graph.add_tensor_like(mat2, mat2_layout);
+    viewFn(graph, {mat2, graph.add_none(), mat2_packed});
   }
 
   api::utils::uvec3 global_size =
       api::utils::divup_vec(graph.extents_of(out), {4, 4, 1});
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  std::string kernel_name("addmm_optimized");
+  std::string kernel_name = graph.get_bool(mat2_is_transposed)
+      ? "linear_optimized"
+      : "addmm_optimized";
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -159,19 +178,20 @@ void add_addmm_optimized_node(
       local_size,
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
-       {{mat1_W_packed, mat2_H_packed, self}, api::MemoryAccessType::READ}},
+       {{mat1_W_packed, mat2_packed, self}, api::MemoryAccessType::READ}},
       // Shader params buffers
       {
           graph.texture_limits_ubo(out),
           graph.sizes_ubo(out),
           graph.sizes_ubo(self),
-          graph.packed_dim_meta_ubo(mat1_W_packed),
+          graph.texture_limits_ubo(mat1_W_packed),
           graph.create_params_buffer(params),
       },
       // Specialization Constants
       {},
       // Resizing Logic
-      resize_addmm_node));
+      resize_addmm_node,
+      {mat2_is_transposed}));
 }
 
 void add_addmm_node(
@@ -181,18 +201,25 @@ void add_addmm_node(
     const ValueRef mat2,
     const ValueRef beta,
     const ValueRef alpha,
-    const ValueRef out) {
+    const ValueRef out,
+    const ValueRef mat2_is_transposed) {
   float alpha_val = 1.0f;
   float beta_val = 1.0f;
 
-  alpha_val = graph.extract_scalar<float>(alpha);
-  beta_val = graph.extract_scalar<float>(beta);
+  if (alpha != kDummyValueRef) {
+    alpha_val = graph.extract_scalar<float>(alpha);
+  }
+  if (beta != kDummyValueRef) {
+    beta_val = graph.extract_scalar<float>(beta);
+  }
 
   Params params = {alpha_val, beta_val};
   if (graph.memory_layout_of(mat1) == api::kChannelsPacked) {
-    add_addmm_optimized_node(graph, self, mat1, mat2, beta, alpha, out, params);
+    add_addmm_optimized_node(
+        graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed);
   } else if (graph.memory_layout_of(mat1) == api::kWidthPacked) {
-    add_addmm_naive_node(graph, self, mat1, mat2, beta, alpha, out, params);
+    add_addmm_naive_node(
+        graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed);
   } else {
     VK_THROW("Input should be channel packed or width packed.");
   }
@@ -200,12 +227,44 @@ void add_addmm_node(
 
 void addmm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   check_addmm_args(graph, args[0], args[1], args[2], args[3], args[4], args[5]);
+  ValueRef mat2_is_transposed = graph.add_scalar(false);
   return add_addmm_node(
-      graph, args[0], args[1], args[2], args[3], args[4], args[5]);
+      graph,
+      args[0],
+      args[1],
+      args[2],
+      args[3],
+      args[4],
+      args[5],
+      mat2_is_transposed);
+}
+
+void linear(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  ValueRef input = args.at(0);
+  ValueRef weight_data = args.at(1);
+  ValueRef bias = args.at(2);
+  ValueRef out = args.at(3);
+  ValueRef weight =
+      prepack_if_tensor_ref(graph, weight_data, api::kWidthPacked);
+  ValueRef mat2_is_transposed = graph.add_scalar(true);
+  if (graph.val_is_none(bias)) {
+    return add_matmul_node(graph, input, weight, out, mat2_is_transposed);
+  } else {
+    return add_addmm_node(
+        graph,
+        bias,
+        input,
+        weight,
+        kDummyValueRef,
+        kDummyValueRef,
+        out,
+        mat2_is_transposed);
+  }
 }
 
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.addmm.default, addmm);
+  VK_REGISTER_OP(aten.linear.default, linear);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
index 063956ad315..0bdfad1c23a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/MatMul.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
@@ -38,20 +39,26 @@ void resize_matmul_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
     const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]);
   vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]);
 
+  bool mat2_is_transposed = graph->get_bool(extra_args[0]);
+
+  const int out_cols = api::utils::val_at(-2, mat1->sizes());
+  const int out_rows = mat2_is_transposed
+      ? api::utils::val_at(-2, mat2->sizes())
+      : api::utils::val_at(-1, mat2->sizes());
+
   std::vector<int64_t> new_out_sizes(3);
   if (mat1->sizes().size() == 2) {
     new_out_sizes.resize(2);
-    new_out_sizes.at(0) = mat1->sizes().at(0);
-    new_out_sizes.at(1) = mat2->sizes().at(1);
+    new_out_sizes.at(0) = out_cols;
+    new_out_sizes.at(1) = out_rows;
   } else {
     new_out_sizes.at(0) = mat1->sizes().at(0);
-    new_out_sizes.at(1) = mat1->sizes().at(1);
-    new_out_sizes.at(2) = mat2->sizes().at(2);
+    new_out_sizes.at(1) = out_cols;
+    new_out_sizes.at(2) = out_rows;
   }
 
   out->virtual_resize(new_out_sizes);
@@ -61,13 +68,16 @@ void add_matmul_naive_node(
     ComputeGraph& graph,
     const ValueRef mat1,
     const ValueRef mat2_data,
-    const ValueRef out) {
+    const ValueRef out,
+    const ValueRef mat2_is_transposed) {
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
   api::utils::uvec3 global_size = graph.extents_of(out);
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  std::string kernel_name("matmul_naive");
+  std::string kernel_name = graph.get_bool(mat2_is_transposed)
+      ? "matmul_transposed_naive"
+      : "matmul_naive";
   kernel_name.reserve(kShaderNameReserve);
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1));
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2));
@@ -89,14 +99,16 @@ void add_matmul_naive_node(
       // Specialization Constants
       {},
       // Resizing Logic
-      resize_matmul_node));
+      resize_matmul_node,
+      {mat2_is_transposed}));
 }
 
 void add_matmul_optimized_node(
     ComputeGraph& graph,
     const ValueRef mat1,
     const ValueRef mat2_data,
-    const ValueRef out) {
+    const ValueRef out,
+    const ValueRef mat2_is_transposed) {
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
   // Ensure mat1 is width packed
@@ -104,18 +116,24 @@ void add_matmul_optimized_node(
   auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
   viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
 
+  const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed);
+
   // Ensure mat2 to height packed
-  ValueRef mat2_H_packed = mat2;
-  if (graph.memory_layout_of(mat2) != api::kHeightPacked) {
-    mat2_H_packed = graph.add_tensor_like(mat2, api::kHeightPacked);
-    viewFn(graph, {mat2, graph.add_none(), mat2_H_packed});
+  ValueRef mat2_packed = mat2;
+  const api::GPUMemoryLayout mat2_layout =
+      mat2_is_transposed_val ? api::kWidthPacked : api::kHeightPacked;
+  if (graph.memory_layout_of(mat2) != mat2_layout) {
+    mat2_packed = graph.add_tensor_like(mat2, mat2_layout);
+    viewFn(graph, {mat2, graph.add_none(), mat2_packed});
   }
 
   api::utils::uvec3 global_size =
       api::utils::divup_vec(graph.extents_of(out), {4, 4, 1});
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  std::string kernel_name("matmul_optimized");
+  std::string kernel_name = mat2_is_transposed_val
+      ? "matmul_transposed_optimized"
+      : "matmul_optimized";
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -125,26 +143,30 @@ void add_matmul_optimized_node(
       local_size,
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
-       {{mat1_W_packed, mat2_H_packed}, api::MemoryAccessType::READ}},
+       {{mat1_W_packed, mat2_packed}, api::MemoryAccessType::READ}},
       // Shader params buffers
       {
           graph.texture_limits_ubo(out),
           graph.sizes_ubo(out),
-          graph.packed_dim_meta_ubo(mat1_W_packed),
+          graph.texture_limits_ubo(mat1_W_packed),
       },
       // Specialization Constants
-      {}));
+      {},
+      // Resizing Logic
+      resize_matmul_node,
+      {mat2_is_transposed}));
 }
 
 void add_matmul_node(
     ComputeGraph& graph,
     const ValueRef mat1,
     const ValueRef mat2_data,
-    const ValueRef out) {
+    const ValueRef out,
+    const ValueRef mat2_is_transposed) {
   if (graph.memory_layout_of(mat1) == api::kChannelsPacked) {
-    add_matmul_optimized_node(graph, mat1, mat2_data, out);
+    add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed);
   } else if (graph.memory_layout_of(mat1) == api::kWidthPacked) {
-    add_matmul_naive_node(graph, mat1, mat2_data, out);
+    add_matmul_naive_node(graph, mat1, mat2_data, out, mat2_is_transposed);
   } else {
     VK_THROW("Input should be channel packed or width packed.");
   }
@@ -152,7 +174,8 @@ void add_matmul_node(
 
 void matmul(ComputeGraph& graph, const std::vector<ValueRef>& args) {
   check_matmul_args(graph, args[0], args[1], args[2]);
-  return add_matmul_node(graph, args[0], args[1], args[2]);
+  const ValueRef mat2_is_transposed = graph.add_scalar(false);
+  return add_matmul_node(graph, args[0], args[1], args[2], mat2_is_transposed);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.h b/backends/vulkan/runtime/graph/ops/impl/MatMul.h
new file mode 100644
index 00000000000..38f7907f1b6
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_matmul_node(
+    ComputeGraph& graph,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef out,
+    const ValueRef mat2_is_transposed);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp
index ef23110d116..b3b4dedefd5 100644
--- a/backends/vulkan/runtime/graph/ops/impl/View.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp
@@ -14,7 +14,50 @@
 
 namespace vkcompute {
 
-void add_view_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
+std::vector<int64_t> compute_out_sizes(
+    std::vector<int64_t> orig_sizes,
+    std::vector<int64_t>& view_sizes) {
+  std::vector<int64_t> out_sizes(view_sizes.begin(), view_sizes.end());
+  int64_t numel = 1;
+  int64_t transferred_numel = 1;
+
+  for (int i = 0; i < orig_sizes.size(); i++) {
+    numel *= orig_sizes.at(i);
+  }
+  for (int i = 0; i < view_sizes.size(); i++) {
+    if (view_sizes.at(i) > 0) {
+      transferred_numel *= view_sizes.at(i);
+    }
+  }
+  for (int i = 0; i < out_sizes.size(); i++) {
+    if (out_sizes.at(i) == -1) {
+      out_sizes.at(i) = numel / transferred_numel;
+    }
+  }
+  return out_sizes;
+}
+
+void resize_view_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
+  if (extra_args[0] == kDummyValueRef || graph->val_is_none(extra_args[0])) {
+    out->virtual_resize(in->sizes());
+  } else {
+    IntListPtr view_sizes = graph->get_int_list(extra_args[0]);
+    std::vector<int64_t> out_sizes =
+        compute_out_sizes(in->sizes(), *view_sizes);
+    out->virtual_resize(out_sizes);
+  }
+}
+
+void add_view_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef sizes,
+    ValueRef out) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
@@ -35,13 +78,14 @@ void add_view_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
       // Parameter Buffers
       {t_out->sizes_ubo(), t_in->sizes_ubo()},
       // Specialization Constants
-      {SV(t_in->gpu_memory_layout_int()), SV(t_out->gpu_memory_layout_int())}));
+      {SV(t_in->gpu_memory_layout_int()), SV(t_out->gpu_memory_layout_int())},
+      // Resizing Logic
+      resize_view_node,
+      {sizes}));
 }
 
 void view(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  // Note: The second argument size_ref is not used here. Since the output
-  // tensor's size have been determined during compilation.
-  return add_view_node(graph, args[0], args[2]);
+  return add_view_node(graph, args[0], args[1], args[2]);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index a1e6227a227..d115f1897fa 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -94,6 +94,26 @@ def get_addmm_inputs():
     return test_suite
 
 
+def get_linear_inputs():
+    MKN_list = [
+        (S2, M2, M1),
+        (L, L, M1),
+    ]
+
+    inputs_list = [((M, K), (N, K), None) for M, K, N in MKN_list]
+    inputs_list += [((M, K), (N, K), (N)) for M, K, N in MKN_list]
+    inputs_list += [((3, M, K), (N, K), None) for M, K, N in MKN_list]
+    inputs_list += [((3, M, K), (N, K), (N)) for M, K, N in MKN_list]
+
+    test_suite = VkTestSuite(inputs_list)
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.layouts = [
+        "api::kWidthPacked",
+        "api::kChannelsPacked",
+    ]
+    return test_suite
+
+
 def get_pool2d_inputs():
     test_suite = VkTestSuite(
         [
@@ -747,6 +767,7 @@ def get_gelu_inputs():
     "aten.addmm.default": get_addmm_inputs(),
     "aten.bmm.default": get_bmm_inputs(),
     "aten.mm.default": get_mm_inputs(),
+    "aten.linear.default": get_linear_inputs(),
     "aten.max_pool2d_with_indices.default": get_pool2d_inputs(),
     "aten.convolution.default": get_conv_inputs(),
     "aten.native_layer_norm.default": get_native_layer_norm_inputs(),

From 39f9c0fb59ea56b78ae3ffe44a287fc49acf54c0 Mon Sep 17 00:00:00 2001
From: Songhao Jia <gasoonjia@meta.com>
Date: Tue, 14 May 2024 16:11:34 -0700
Subject: [PATCH 50/62] Temporarily disable dim order ops in executorch/examples
 to mitigate OSS iOS CI issue (#3610)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3610
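As title: set `_skip_dim_order=True` in the `EdgeCompileConfig` used by examples/portable/utils.py as a temporary workaround for the OSS iOS CI issue.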

Reviewed By: shoumikhin

Differential Revision: D57353025

fbshipit-source-id: f45ffa81a1d877238cac3068bae9ecf3b365230f
---
 examples/portable/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/portable/utils.py b/examples/portable/utils.py
index 82242f585a8..9e4a9607618 100644
--- a/examples/portable/utils.py
+++ b/examples/portable/utils.py
@@ -20,6 +20,7 @@
 
 _EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
     _check_ir_validity=True,
+    _skip_dim_order=True,  # TODO(T189114319): Reuse dim order op after solving the ios oss issue
 )
 
 

From aaa2f2e334c0707f23a95ba8684ffafbc03a634c Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@fb.com>
Date: Tue, 14 May 2024 21:47:10 -0700
Subject: [PATCH 51/62] Add a way to run c++ unittest on OSS (#3606)

Summary:
Add a CMake rule for the runtime/core/portable_type tests, and a script (test/run_oss_cpp_tests.sh) to build and run them.

Pull Request resolved: https://github.com/pytorch/executorch/pull/3606

Reviewed By: larryliu0820

Differential Revision: D57343746

Pulled By: kirklandsign

fbshipit-source-id: 289a37fb97c7f80cab44aa2ba30b859d1d527e59
---
 .../core/portable_type/test/CMakeLists.txt    | 46 +++++++++++++++++++
 test/run_oss_cpp_tests.sh                     | 23 ++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 runtime/core/portable_type/test/CMakeLists.txt
 create mode 100644 test/run_oss_cpp_tests.sh

diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt
new file mode 100644
index 00000000000..3ea05677c3d
--- /dev/null
+++ b/runtime/core/portable_type/test/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+cmake_minimum_required(VERSION 3.19)
+project(runtime_core_portable_type_test)
+
+# Use C++11 for test.
+set(CMAKE_CXX_STANDARD 11)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+
+# Find prebuilt executorch library
+find_package(executorch CONFIG REQUIRED)
+
+enable_testing()
+find_package(GTest CONFIG REQUIRED)
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories
+    ${EXECUTORCH_ROOT}/..
+)
+target_include_directories(executorch INTERFACE ${_common_include_directories})
+
+set(_test_srcs optional_test.cpp executor_tensor_test.cpp half_test.cpp
+               scalar_test.cpp tensor_impl_test.cpp
+)
+
+add_executable(runtime_core_portable_type_test ${_test_srcs})
+target_link_libraries(
+  runtime_core_portable_type_test GTest::gtest GTest::gtest_main executorch
+)
+add_test(ExecuTorchTest runtime_core_portable_type_test)
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
new file mode 100644
index 00000000000..dcc8b4d27f6
--- /dev/null
+++ b/test/run_oss_cpp_tests.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+build_executorch() {
+  cmake . -DCMAKE_INSTALL_PREFIX=cmake-out -DEXECUTORCH_BUILD_GTESTS=ON -Bcmake-out
+  cmake --build cmake-out -j9 --target install
+}
+
+build_and_run_test() {  # $1: test source dir, relative to the repo root
+  local test_dir=$1
+  cmake "${test_dir}" -Bcmake-out/"${test_dir}" -DCMAKE_INSTALL_PREFIX=cmake-out
+  cmake --build cmake-out/"${test_dir}"
+  for t in cmake-out/"${test_dir}"/*test; do ./"$t"; done  # plain glob: $(...) would run the binary and iterate over its output
+}
+
+build_executorch
+build_and_run_test runtime/core/portable_type/test/

From fb24c9d8d3eabcb9545feb7c684bc188a5311cbd Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <shoumikhin@meta.com>
Date: Tue, 14 May 2024 23:30:28 -0700
Subject: [PATCH 52/62] Fix headers search paths for nlohmann json in Core ML.
 (#3614)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3614
overriding_review_checks_triggers_an_audit_and_retroactive_review
Oncall Short Name: executorch

Differential Revision: D57368772

fbshipit-source-id: aea98235a4bf936bc462eca353d8f47463fd8a2e
---
 .../coreml/runtime/test/DatabaseTests.mm      |  4 +-
 .../runtime/test/InMemoryFileSystemTests.mm   | 38 +++++++++----------
 .../coreml/runtime/test/KeyValueStoreTests.mm | 22 +++++------
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/backends/apple/coreml/runtime/test/DatabaseTests.mm b/backends/apple/coreml/runtime/test/DatabaseTests.mm
index 9b89f20aa5a..1d66448852e 100644
--- a/backends/apple/coreml/runtime/test/DatabaseTests.mm
+++ b/backends/apple/coreml/runtime/test/DatabaseTests.mm
@@ -8,7 +8,7 @@
 #import <XCTest/XCTest.h>
 
 #import <database.hpp>
-#import <json.hpp>
+#import <nlohmann/json.hpp>
 
 @interface DatabaseTests : XCTestCase
 
@@ -58,7 +58,7 @@ - (void)testDatabaseQuery {
     XCTAssertTrue(insertStatement->bind_name("$value", std::string("1"), error));
     XCTAssertTrue(insertStatement->execute(error));
     XCTAssertTrue(database->get_row_count("TEST", error) == 1);
-    
+
     auto query = database->prepare_statement("SELECT * FROM TEST", error);
     XCTAssertTrue(query != nullptr);
     XCTAssertTrue(query->step(error));
diff --git a/backends/apple/coreml/runtime/test/InMemoryFileSystemTests.mm b/backends/apple/coreml/runtime/test/InMemoryFileSystemTests.mm
index a4ccbd94b68..226a42aaaaf 100644
--- a/backends/apple/coreml/runtime/test/InMemoryFileSystemTests.mm
+++ b/backends/apple/coreml/runtime/test/InMemoryFileSystemTests.mm
@@ -13,7 +13,7 @@
 
 #import <inmemory_filesystem_utils.hpp>
 #import <memory_stream.hpp>
-#import <json.hpp>
+#import <nlohmann/json.hpp>
 #import <json_util.hpp>
 
 using json = nlohmann::json;
@@ -25,11 +25,11 @@
     inline Content(std::string identifier, std::string value) noexcept
     :identifier(std::move(identifier)), value(std::move(value))
     {}
-    
+
     inline Content() noexcept
     :identifier(""), value("")
     {}
-    
+
     std::string identifier;
     std::string value;
 };
@@ -80,7 +80,7 @@ T from_memory_buffer(const std::shared_ptr<MemoryBuffer>& buffer) {
     for (size_t i = 0; i < length; ++i) {
         result += chars[rand() % (sizeof(chars) - 1)];
     }
-    
+
     return result;
 }
 
@@ -178,12 +178,12 @@ - (void)testWriteItemAtPath {
     Content content("abc", "xyz");
     std::shared_ptr<MemoryBuffer> buffer = to_memory_buffer(content);
     std::error_code error;
-    
+
     XCTAssertTrue(fs.make_directory({"dir1"}, InMemoryFileSystem::Attributes(), false, error));
     XCTAssertTrue(fs.make_file({"dir1", "content.json"}, buffer, InMemoryFileSystem::Attributes(), false  /*overwrite*/, error));
     XCTAssertTrue(fs.make_directory({"dir1", "dir2"}, InMemoryFileSystem::Attributes(), false, error));
     XCTAssertTrue(fs.make_file({"dir1", "dir2", "content.json"}, buffer, InMemoryFileSystem::Attributes(), false  /*overwrite*/, error));
-    
+
     NSURL *dirURL = [[NSURL fileURLWithPath:NSTemporaryDirectory()] URLByAppendingPathComponent:[NSUUID UUID].UUIDString];
     NSFileManager *fm = [[NSFileManager alloc] init];
     NSError *localError = nil;
@@ -220,7 +220,7 @@ - (void)testCreationFromFileSystem {
     NSData *data = [NSData dataWithBytesNoCopy:buffer->data() length:buffer->size() freeWhenDone:NO];
     XCTAssertTrue([data writeToURL:[dirURL URLByAppendingPathComponent:@"dir1/content.json"] atomically:YES]);
     XCTAssertTrue([data writeToURL:[dirURL URLByAppendingPathComponent:@"dir2/content.json"] atomically:YES]);
-    
+
     std::filesystem::path dirPath(dirURL.path.UTF8String);
     std::error_code error;
     auto fs = InMemoryFileSystem::make_from_directory(dirPath,
@@ -256,7 +256,7 @@ - (void)_testSerdeWithConfig:(SerdeVerificationConfig)config {
         }
         XCTAssertTrue(fs.write_item_to_disk({}, dirURL.path.UTF8String, true, error));
     }
-    
+
     // Verify serialization.
     std::shared_ptr<MemoryBuffer> buffer = nullptr;
     {
@@ -264,7 +264,7 @@ - (void)_testSerdeWithConfig:(SerdeVerificationConfig)config {
         auto fs = InMemoryFileSystem::make_from_directory(dirURL.path.UTF8String,
                                                           config.file_load_option,
                                                           error);
-    
+
         XCTAssertTrue(fs != nullptr);
         size_t length = inmemoryfs::get_buffer_size_for_serialization(*fs, {}, config.alignment);
         switch (config.file_load_option) {
@@ -272,15 +272,15 @@ - (void)_testSerdeWithConfig:(SerdeVerificationConfig)config {
                 buffer = MemoryBuffer::make_using_mmap(length);
                 break;
             }
-                
+
             default:
                 buffer = MemoryBuffer::make_using_malloc(length);
                 break;
         }
-        
+
         XCTAssertTrue(inmemoryfs::serialize(*fs, {}, config.alignment, buffer->data(), error));
     }
-    
+
     // Verify de-serialization.
     {
         auto fs = inmemoryfs::make_from_buffer(buffer);
@@ -290,7 +290,7 @@ - (void)_testSerdeWithConfig:(SerdeVerificationConfig)config {
             XCTAssertEqual(from_memory_buffer<Content>(fs->get_file_content({"test", "dir", content.identifier}, error)), content);
         }
     }
-    
+
     [fm removeItemAtURL:dirURL error:nil];
 }
 
@@ -332,7 +332,7 @@ - (void)testSerde {
         .file_base_length = 100,
         .alignment = 2 * (size_t)getpagesize(),
     });
-   
+
     for (const auto& config : configs) {
         [self _testSerdeWithConfig:config];
     }
@@ -349,7 +349,7 @@ - (void)testReadJSONObject {
         auto j = json::parse(object.value().begin(), object.value().end());
         XCTAssertEqual(j["x"], 1, "The value must match");
     }
-    
+
     {
         std::stringstream ss;
         std::string fragment("{\"x\" : 1");
@@ -357,8 +357,8 @@ - (void)testReadJSONObject {
         auto object = executorchcoreml::json::read_object_from_stream(ss);
         XCTAssertFalse(object.has_value(), "There is no closing brace, `read_json_object` must return nullopt");
     }
-    
-    
+
+
     {
         std::stringstream ss;
         std::string fragment("{\"x\" : \"\\\"1\"}xyz");
@@ -369,7 +369,7 @@ - (void)testReadJSONObject {
         std::string value = j["x"];
         XCTAssertEqual(value, std::string("\"1"), "The value must match");
     }
-    
+
     {
         std::stringstream ss;
         std::string fragment("{sdhalskjks}");
@@ -384,7 +384,7 @@ - (void)testReadJSONObject {
         }
         XCTAssertNotEqual(eptr, nullptr, "Parsing invalid json object must throw an exception");
     }
-    
+
 }
 
 @end
diff --git a/backends/apple/coreml/runtime/test/KeyValueStoreTests.mm b/backends/apple/coreml/runtime/test/KeyValueStoreTests.mm
index 81a667bc375..4d113efa43a 100644
--- a/backends/apple/coreml/runtime/test/KeyValueStoreTests.mm
+++ b/backends/apple/coreml/runtime/test/KeyValueStoreTests.mm
@@ -9,7 +9,7 @@
 #import <XCTest/XCTest.h>
 
 #import <json_key_value_store.hpp>
-#import <json.hpp>
+#import <nlohmann/json.hpp>
 
 namespace {
 using json = nlohmann::json;
@@ -24,11 +24,11 @@
     inline Entry(std::string identifier, size_t count) noexcept
     :identifier(std::move(identifier)), count(count)
     {}
-    
+
     inline Entry() noexcept
     :identifier(""), count(0)
     {}
-    
+
     inline std::string to_json_string() const noexcept {
         json j;
         to_json(j, *this);
@@ -36,12 +36,12 @@ inline Entry() noexcept
         ss << j;
         return ss.str();
     }
-    
+
     inline void from_json_string(const std::string& json_string) noexcept {
         auto j = json::parse(json_string);
         from_json(j, *this);
     }
-    
+
     std::string identifier;
     size_t count;
 };
@@ -110,12 +110,12 @@ - (void)testJSONKeyValueStore {
     std::error_code error;
     auto database = Database::make_inmemory(Database::SynchronousMode::Normal, 100, error);
     auto store = JSONKeyValueStore<int, Entry>::make(std::move(database), "test", error);
-    
+
     XCTAssertTrue(store->put(1, Entry("1", 1), error));
     auto entry1 = store->get(1, error);
     XCTAssertTrue(entry1.value().count == 1);
     XCTAssertTrue(entry1.value().identifier == "1");
-    
+
     XCTAssertTrue(store->put(2, Entry("2", 2), error));
     auto entry2 = store->get(2, error);
     XCTAssertTrue(entry2.value().count == 2);
@@ -134,7 +134,7 @@ - (void)testKVStoreTransactionCommit {
         // Commit the transaction.
         return true;
     }, Database::TransactionBehavior::Immediate, error));
-    
+
     XCTAssertTrue(store->size(error) == 2);
 }
 
@@ -150,7 +150,7 @@ - (void)testKVStoreTransactionRollback {
         // Rollback the transaction.
         return false;
     }, Database::TransactionBehavior::Immediate, error));
-    
+
     XCTAssertTrue(store->size(error) == 0);
 }
 
@@ -173,7 +173,7 @@ - (void)testKVStoreGetKeysSortedByAccessTime {
         // 1 is accessed first then 2 and then 3
         XCTAssertTrue(keys == (std::vector<int>{1, 2, 3}));
     }
-    
+
     {
         std::vector<int> keys;
         XCTAssertTrue(store->get_keys_sorted_by_access_time([&keys](int key) {
@@ -210,7 +210,7 @@ - (void)testKVStoreGetKeysSortedByAccessCount {
         // 3 is accessed 1 time, 2 is accessed 2 times, and 1 is accessed 3 times.
         XCTAssertTrue(keys == (std::vector<int>{3, 2, 1}));
     }
-    
+
     {
         std::vector<int> keys;
         XCTAssertTrue(store->get_keys_sorted_by_access_count([&keys](int key) {

From 99ec946944719f4ed20d0f5cafe4f1f8a7ebbcc3 Mon Sep 17 00:00:00 2001
From: Chris Hopman <cjhopman@meta.com>
Date: Wed, 15 May 2024 00:00:26 -0700
Subject: [PATCH 53/62] Add typed-arena to oss shim (#3576)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/3576

^

bypass-github-export-checks

Reviewed By: bigfootjon

Differential Revision: D57225582

fbshipit-source-id: 30edd8ebe70468eee5c8586060e849a53c9828d9
---
 shim/third-party/rust/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/shim/third-party/rust/Cargo.toml b/shim/third-party/rust/Cargo.toml
index 88ba5dda90f..e0e31bf578a 100644
--- a/shim/third-party/rust/Cargo.toml
+++ b/shim/third-party/rust/Cargo.toml
@@ -209,6 +209,7 @@ tracing = "0.1.22"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 triomphe = "0.1.11"
 trybuild = "1.0.56"
+typed-arena = "2.0"
 twox-hash = "1.6.1"
 unicode-segmentation = "1.7"
 uuid = { version = "1.2", features = ["v4"] }

From c2bc810c4a113ad513b05edd79127b1ba3e62a16 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Thu, 16 May 2024 00:21:50 +0530
Subject: [PATCH 54/62] Update .gitmodules

Updating .gitmodules
---
 .gitmodules | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitmodules b/.gitmodules
index 40af2980839..daed303750b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -68,3 +68,6 @@
 [submodule "third-party/ios-cmake"]
 	path = third-party/ios-cmake
 	url = https://github.com/leetal/ios-cmake
+[submodule "backends/cadence/hifi/third-party/nnlib"]
+	path = backends/cadence/hifi/third-party/nnlib
+	url = https://github.com/foss-xtensa/nnlib-hifi4.git

From 82e159e3a8e1ad80edda28322882014ab505f7f0 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Wed, 15 May 2024 12:04:57 -0700
Subject: [PATCH 55/62] adding nnlib repo

---
 .gitmodules                                         | 3 +++
 backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 backends/cadence/hifi/third-party/nnlib/nnlib-hifi4

diff --git a/.gitmodules b/.gitmodules
index daed303750b..8e67a93ad03 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -71,3 +71,6 @@
 [submodule "backends/cadence/hifi/third-party/nnlib"]
 	path = backends/cadence/hifi/third-party/nnlib
 	url = https://github.com/foss-xtensa/nnlib-hifi4.git
+[submodule "backends/cadence/hifi/third-party/nnlib/nnlib-hifi4"]
+	path = backends/cadence/hifi/third-party/nnlib/nnlib-hifi4
+	url = https://github.com/foss-xtensa/nnlib-hifi4.git
diff --git a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4
new file mode 160000
index 00000000000..6a9ea45e23e
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4
@@ -0,0 +1 @@
+Subproject commit 6a9ea45e23ef591fe207442df33a5ebe88bbe8de

From 6dfda8d27127ac50f52a25a231cb47409ece9dd8 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Thu, 16 May 2024 16:24:37 +0530
Subject: [PATCH 56/62] Update .gitmodules

Removing extra entry in .gitmodules
---
 .gitmodules | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 8e67a93ad03..e22780c8e84 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -68,9 +68,6 @@
 [submodule "third-party/ios-cmake"]
 	path = third-party/ios-cmake
 	url = https://github.com/leetal/ios-cmake
-[submodule "backends/cadence/hifi/third-party/nnlib"]
-	path = backends/cadence/hifi/third-party/nnlib
-	url = https://github.com/foss-xtensa/nnlib-hifi4.git
 [submodule "backends/cadence/hifi/third-party/nnlib/nnlib-hifi4"]
 	path = backends/cadence/hifi/third-party/nnlib/nnlib-hifi4
 	url = https://github.com/foss-xtensa/nnlib-hifi4.git

From 8cc2bd8cc2994cce425aae3ac759ce86341ac679 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Fri, 17 May 2024 00:45:45 +0530
Subject: [PATCH 57/62] Adding build support for nnlib

---
 backends/cadence/CMakeLists.txt               | 118 ++----------------
 .../hifi/third-party/nnlib/CMakeLists.txt     |  35 ++++++
 2 files changed, 44 insertions(+), 109 deletions(-)
 create mode 100644 backends/cadence/hifi/third-party/nnlib/CMakeLists.txt

diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt
index f1d5ccbd2e5..14030719153 100644
--- a/backends/cadence/CMakeLists.txt
+++ b/backends/cadence/CMakeLists.txt
@@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD)
 endif()
 
 # Set the project name.
-project(cadence_executorch_example)
+project(cadence_backend)
 
 # Source root directory for executorch.
 if(NOT EXECUTORCH_ROOT)
@@ -21,121 +21,21 @@ endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
-# Find prebuilt executorch lib
-find_package(executorch CONFIG REQUIRED)
-
-add_compile_options(
-  -DSDK_DEBUGCONSOLE=1
-  -DSERIAL_PORT_TYPE_UART=1
-  -DDEBUG_CONSOLE_RX_ENABLE=0
-  -DDEBUG
-  -DCPU_MIMXRT685SFVKB_dsp
-  -DMCUXPRESSO_SDK
-  -g
-  -O0
-  -Wall
-  -fsigned-char
-  -Wno-missing-braces
-  -fmessage-length=0
-  -DPRINTF_FLOAT_ENABLE=1
-)
-
-if(NOT DEFINED NXP_SDK_ROOT_DIR)
-  message(FATAL_ERROR "NXP_SDK_ROOT_DIR is not set")
-endif()
-
-# lint_cmake: -linelength
-set(SOURCES
-    ${NXP_SDK_ROOT_DIR}/components/lists/fsl_component_generic_list.c
-    ${NXP_SDK_ROOT_DIR}/components/uart/fsl_adapter_usart.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_clock.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_common.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_common_dsp.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_flexcomm.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_gpio.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_mu.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_reset.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers/fsl_usart.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/system_MIMXRT685S_dsp.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/utilities/debug_console_lite/fsl_assert.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/utilities/debug_console_lite/fsl_debug_console.c
-    ${NXP_SDK_ROOT_DIR}/boards/evkmimxrt685/dsp_examples/mu_polling/dsp/board_hifi4.c
-    ${NXP_SDK_ROOT_DIR}/boards/evkmimxrt685/dsp_examples/mu_polling/dsp/pin_mux.c
-    ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/utilities/str/fsl_str.c
-)
-
-add_library(dsp_mu_polling_libs STATIC ${SOURCES})
-
-target_include_directories(
-  dsp_mu_polling_libs
-  PUBLIC ${NXP_SDK_ROOT_DIR}
-         ${NXP_SDK_ROOT_DIR}/components/uart
-         ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/drivers
-         ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/utilities/debug_console_lite
-         ${NXP_SDK_ROOT_DIR}/components/lists
-         ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S
-         ${NXP_SDK_ROOT_DIR}/CMSIS/Core/Include
-         ${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/utilities/str
-         ${NXP_SDK_ROOT_DIR}/boards/evkmimxrt685/dsp_examples/mu_polling/dsp
-)
-
-add_library(extension_runner_util STATIC IMPORTED)
-set_property(
-  TARGET extension_runner_util
-  PROPERTY
-    IMPORTED_LOCATION
-    "${CMAKE_CURRENT_LIST_DIR}/../../cmake-out/extension/runner_util/libextension_runner_util.a"
-)
-
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/hifi/third-party/nnlib)  
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/hifi/operators)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/hifi/kernels)
 
-# Generate the model header file
-add_custom_command(
-  OUTPUT ${CMAKE_BINARY_DIR}/model_pte.h
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/utils/gen_header.py
-          --model_path ${MODEL_PATH} --header_output_path ${CMAKE_BINARY_DIR}
-  COMMENT "Converting .pte model to header file..."
-  DEPENDS ${CMAKE_CURRENT_LIST_DIR}/utils/gen_header.py
-)
-
-add_custom_target(gen_model_header DEPENDS ${CMAKE_BINARY_DIR}/model_pte.h)
 
-add_executable(cadence_executorch_example executor_runner.cpp)
-add_dependencies(cadence_executorch_example gen_model_header)
-
-# lint_cmake: -linelength
-target_include_directories(
-  cadence_executorch_example PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
-                                    ${_common_include_directories}
+install(
+  TARGETS cadence_ops_lib
+  DESTINATION lib
+  INCLUDES
+  DESTINATION ${_common_include_directories}
 )
 
-target_link_options(
-  cadence_executorch_example PRIVATE
-  -mlsp=${NXP_SDK_ROOT_DIR}/devices/MIMXRT685S/xtensa/min-rt
-)
-target_link_libraries(
-  cadence_executorch_example dsp_mu_polling_libs cadence_ops_lib
-  extension_runner_util executorch
-)
 
-add_custom_command(
-  TARGET cadence_executorch_example
-  POST_BUILD
-  COMMAND
-    ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/utils/post_compilation.py
-    ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME} ${CMAKE_BINARY_DIR}
-  COMMENT
-    "Generating .bin files that can be used to flash the DSP with. Copy over
-    the dsp_text_release.bin and dsp_data_release.bin that are generated into
-    your NXP MCUXpresso IDE workspace and flash the DSP with these binaries."
-    DEPENDS
-    ${CMAKE_CURRENT_LIST_DIR}/utils/post_compilation.py
-)
+
+                                                            
diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
new file mode 100644
index 00000000000..b7c777fa3ea
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
@@ -0,0 +1,35 @@
+include(ExternalProject)
+cmake_minimum_required(VERSION 3.10.0)
+project(cadence_nnlib)
+
+set(CMAKE_C_COMPILER xt-clang)
+set(CMAKE_CXX_COMPILER xt-clang++)
+set(CMAKE_LINKER xt-ld)
+  
+        
+add_custom_target( nnlib_target ALL COMMAND make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nnlib/build -j )
+
+add_library(xa_nnlib STATIC IMPORTED GLOBAL)
+add_dependencies(xa_nnlib nnlib_target)
+
+set_property(
+  TARGET xa_nnlib
+  PROPERTY 
+  IMPORTED_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/nnlib-hifi4/xa_nnlib/build/libxa_nnlib.a"
+)
+
+#set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+#install(
+#  TARGETS xa_nnlib
+#  DESTINATION lib
+#  DESTINATION ${_common_include_directories}
+#)
+
+
+
+
+
+
+
+
+

From 2ec6046ba502ffd1db96e4cc23949cb1647af3c9 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Fri, 17 May 2024 11:49:46 -0700
Subject: [PATCH 58/62] Adding CMakeLists.txt changes to link nnlib

---
 CMakeLists.txt                                  | 10 +++++++++-
 backends/cadence/hifi/kernels/CMakeLists.txt    |  2 ++
 .../hifi/third-party/nnlib/CMakeLists.txt       | 17 ++++-------------
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b4c31a131c..42568129c02 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -417,7 +417,7 @@ target_link_libraries(executorch_no_prim_ops PRIVATE program_schema)
 # Check if dl exists for this toolchain and only then link it.
 find_library(DL_LIBRARY_EXISTS NAMES dl)
 # Check if the library was found
-if(DL_LIBRARY_EXISTS)
+if(DL_LIBRARY_EXISTS AND NOT EXECUTORCH_BUILD_CADENCE)
   target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr()
 endif()
 target_include_directories(
@@ -443,7 +443,9 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(NOT EXECUTORCH_BUILD_CADENCE)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+endif()
 
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
@@ -496,6 +498,8 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
 
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
+  elseif(EXECUTORCH_BUILD_CADENCE)
+    list(APPEND _executor_runner_libs cadence_ops_lib)
   else()
     list(APPEND _executor_runner_libs portable_ops_lib)
   endif()
@@ -566,6 +570,10 @@ if(EXECUTORCH_BUILD_COREML)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
 endif()
 
+if(EXECUTORCH_BUILD_CADENCE)
+   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
+endif()
+
 if(EXECUTORCH_BUILD_PYBIND)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)
 
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index 9d4d456d8bc..872e62fc970 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -20,3 +20,5 @@ target_include_directories(
          ${NN_LIB_BASE_DIR}/xa_nnlib/algo/ndsp/hifi4/include/
          ${NXP_SDK_ROOT_DIR}/middleware/dsp/naturedsp/hifi4/include/
 )
+
+target_link_libraries(cadence_kernels PRIVATE xa_nnlib)
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
index b7c777fa3ea..d038d0caa1f 100644
--- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
+++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
@@ -1,13 +1,9 @@
-include(ExternalProject)
+
 cmake_minimum_required(VERSION 3.10.0)
 project(cadence_nnlib)
 
-set(CMAKE_C_COMPILER xt-clang)
-set(CMAKE_CXX_COMPILER xt-clang++)
-set(CMAKE_LINKER xt-ld)
-  
         
-add_custom_target( nnlib_target ALL COMMAND make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nnlib/build -j )
+add_custom_target( nnlib_target ALL COMMAND make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build -j )
 
 add_library(xa_nnlib STATIC IMPORTED GLOBAL)
 add_dependencies(xa_nnlib nnlib_target)
@@ -15,15 +11,10 @@ add_dependencies(xa_nnlib nnlib_target)
 set_property(
   TARGET xa_nnlib
   PROPERTY 
-  IMPORTED_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/nnlib-hifi4/xa_nnlib/build/libxa_nnlib.a"
+  IMPORTED_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/nnlib-hifi4/xa_nnlib/build/xa_nnlib.a"
 )
 
-#set(_common_include_directories ${EXECUTORCH_ROOT}/..)
-#install(
-#  TARGETS xa_nnlib
-#  DESTINATION lib
-#  DESTINATION ${_common_include_directories}
-#)
+
 
 
 

From 2da285eb612474e0d4454e3e1c48791847ebf63f Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Wed, 22 May 2024 04:14:31 -0700
Subject: [PATCH 59/62] Adding CFLAG updates

---
 backends/cadence/cadence.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/cadence/cadence.cmake b/backends/cadence/cadence.cmake
index 137b178ab92..25f241f205c 100644
--- a/backends/cadence/cadence.cmake
+++ b/backends/cadence/cadence.cmake
@@ -41,8 +41,8 @@ set(CMAKE_CROSSCOMPILING TRUE)
 set(CMAKE_C_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang)
 set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++)
 
-set(CMAKE_C_FLAGS_INIT "-stdlib=libc++")
-set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++")
+set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
+set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
 set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET})
 set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld)
 add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static)

From 5ad8ab0ad451c5c21b100adc3de14585a7848d78 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Tue, 28 May 2024 12:12:19 -0700
Subject: [PATCH 60/62] Adding quantizer and dequantizer from nnlib

---
 backends/cadence/hifi/kernels/kernels.h       | 25 +++++++++++++++++++
 .../hifi/operators/dequantize_per_tensor.cpp  | 17 ++++++++-----
 .../hifi/operators/quantize_per_tensor.cpp    | 17 ++++++++-----
 3 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 13e0470b382..59bf4c41f65 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -12,6 +12,31 @@
 #include "stddef.h"
 #include "xa_type_def.h"
 
+                                     
+extern "C" WORD32 xa_nn_elm_quantize_f32_asym8s(WORD8 * __restrict__ p_out,
+                                     const FLOAT32 * __restrict__ p_inp,
+                                     FLOAT32 out_scale,
+                                     WORD32  out_zero_bias,
+                                     WORD32  num_elm);
+    
+/*extern "C" WORD32 xa_nn_elm_quantize_f32_asym8u(UWORD8 * __restrict__ p_out,
+                                     const FLOAT32 * __restrict__ p_inp,
+                                     FLOAT32 out_scale,
+                                     WORD32  out_zero_bias,
+                                     WORD32  num_elm); */
+    
+extern "C" WORD32 xa_nn_elm_dequantize_asym8s_f32(FLOAT32 * __restrict__ p_out,
+                                       const WORD8 * __restrict__ p_inp,
+                                       WORD32  inp_zero_bias,
+                                       FLOAT32 inp_scale,
+                                       WORD32  num_elm);
+    
+/*extern "C" WORD32 xa_nn_elm_dequantize_asym8u_f32(FLOAT32 * __restrict__ p_out,
+                                       const UWORD8 * __restrict__ p_inp,
+                                       WORD32   inp_zero_bias,
+                                       FLOAT32  inp_scale,
+                                       WORD32   num_elm);*/
+
 namespace impl {
 namespace HiFi {
 namespace kernels {
diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
index dcc4ace7898..24984a89a66 100644
--- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
+++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
@@ -31,16 +31,21 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::HiFi::kernels::dequantize<uint8_t>(
-        out_data, input_data, scale, zero_point, numel);
+#if 0 //NNLIB_OPT
+    xa_nn_elm_dequantize_asym8u_f32(out_data, input_data, zero_point, scale, numel);
+#else    
+    impl::HiFi::kernels::dequantize<uint8_t>(out_data, input_data, scale, zero_point, numel);
+#endif
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::HiFi::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+#if 1 //NNLIB_OPT
+    xa_nn_elm_dequantize_asym8s_f32(out_data, input_data, zero_point, scale, numel);
+#else    
+    impl::HiFi::kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
+#endif
   } else if (input.scalar_type() == ScalarType::Int) {
     const int32_t* input_data = input.const_data_ptr<int32_t>();
-    impl::HiFi::kernels::dequantize<int32_t>(
-        out_data, input_data, scale, zero_point, numel);
+    impl::HiFi::kernels::dequantize<int32_t>(out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type());
   }
diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp
index ec186cc68e2..ea595d12cbc 100644
--- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp
+++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp
@@ -33,16 +33,21 @@ void quantize_per_tensor_out(
 
   if (out.scalar_type() == ScalarType::Byte) {
     uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-    impl::HiFi::kernels::quantize<uint8_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
+#if 0 //NNLIB_OPT     
+    xa_nn_elm_quantize_f32_asym8u(out_data, input_data, scale, zero_point, numel);
+#else    
+    impl::HiFi::kernels::quantize<uint8_t>(out_data, input_data, 1. / scale, zero_point, numel);
+#endif
   } else if (out.scalar_type() == ScalarType::Char) {
     int8_t* out_data = out.mutable_data_ptr<int8_t>();
-    impl::HiFi::kernels::quantize<int8_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
+#if 1//NNLIB_OPT
+    xa_nn_elm_quantize_f32_asym8s(out_data, input_data, scale, zero_point, numel);
+#else    
+    impl::HiFi::kernels::quantize<int8_t>(out_data, input_data, 1. / scale, zero_point, numel);
+#endif        
   } else if (out.scalar_type() == ScalarType::Int) {
     int32_t* out_data = out.mutable_data_ptr<int32_t>();
-    impl::HiFi::kernels::quantize<int32_t>(
-        out_data, input_data, 1. / scale, zero_point, numel);
+    impl::HiFi::kernels::quantize<int32_t>(out_data, input_data, 1. / scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type());
   }

From 6b6b80b8aa4b47e174c22fc059e09c4c3859a47a Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Wed, 29 May 2024 13:24:11 -0700
Subject: [PATCH 61/62] Moving nnlib obj and lib files to cmake build dir

---
 .../cadence/hifi/third-party/nnlib/CMakeLists.txt     | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
index d038d0caa1f..d8f2b4eb3d9 100644
--- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
+++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt
@@ -3,15 +3,22 @@ cmake_minimum_required(VERSION 3.10.0)
 project(cadence_nnlib)
 
         
-add_custom_target( nnlib_target ALL COMMAND make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build -j )
+add_custom_target( nnlib_target ALL COMMAND 
+                    make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build 
+                    OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj 
+                    LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib 
+                    -j8 )
 
 add_library(xa_nnlib STATIC IMPORTED GLOBAL)
 add_dependencies(xa_nnlib nnlib_target)
 
+message("NNLIB")
+message("${CMAKE_CURRENT_BINARY_DIR}")
+
 set_property(
   TARGET xa_nnlib
   PROPERTY 
-  IMPORTED_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/nnlib-hifi4/xa_nnlib/build/xa_nnlib.a"
+  IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a"
 )
 
 

From 33286466e4f9dcebda1c04ecdfcf12bb84253dc8 Mon Sep 17 00:00:00 2001
From: dijopaul <dijopaul@cadence.com>
Date: Wed, 29 May 2024 13:40:03 -0700
Subject: [PATCH 62/62] Clean up quantize and dequantize

---
 backends/cadence/hifi/operators/dequantize_per_tensor.cpp | 4 ++--
 backends/cadence/hifi/operators/quantize_per_tensor.cpp   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
index 24984a89a66..3f683a6d713 100644
--- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
+++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp
@@ -31,14 +31,14 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-#if 0 //NNLIB_OPT
+#if 0 // NNLIB_OPT: disabled, xa_nn_elm_dequantize_asym8u_f32 is not available in nnlib
     xa_nn_elm_dequantize_asym8u_f32(out_data, input_data, zero_point, scale, numel);
 #else    
     impl::HiFi::kernels::dequantize<uint8_t>(out_data, input_data, scale, zero_point, numel);
 #endif
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-#if 1 //NNLIB_OPT
+#if NNLIB_OPT
     xa_nn_elm_dequantize_asym8s_f32(out_data, input_data, zero_point, scale, numel);
 #else    
     impl::HiFi::kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp
index ea595d12cbc..3137b91f6be 100644
--- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp
+++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp
@@ -33,14 +33,14 @@ void quantize_per_tensor_out(
 
   if (out.scalar_type() == ScalarType::Byte) {
     uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-#if 0 //NNLIB_OPT     
+#if 0 // NNLIB_OPT: disabled, xa_nn_elm_quantize_f32_asym8u is not available in nnlib
     xa_nn_elm_quantize_f32_asym8u(out_data, input_data, scale, zero_point, numel);
 #else    
     impl::HiFi::kernels::quantize<uint8_t>(out_data, input_data, 1. / scale, zero_point, numel);
 #endif
   } else if (out.scalar_type() == ScalarType::Char) {
     int8_t* out_data = out.mutable_data_ptr<int8_t>();
-#if 1//NNLIB_OPT
+#if NNLIB_OPT
     xa_nn_elm_quantize_f32_asym8s(out_data, input_data, scale, zero_point, numel);
 #else    
     impl::HiFi::kernels::quantize<int8_t>(out_data, input_data, 1. / scale, zero_point, numel);