From df69dfd92b805bf3a4cc1c8af493f6e859dc3d48 Mon Sep 17 00:00:00 2001
From: Rushi-cad <gherderu@cadence.com>
Date: Tue, 26 Nov 2024 19:02:00 -0800
Subject: [PATCH 1/3] Integrated temporary mem alloc functionality in place of
 malloc

---
 backends/cadence/aot/functions_hifi.yaml      | 13 ++++++---
 backends/cadence/hifi/kernels/kernels.cpp     |  5 ++++
 backends/cadence/hifi/kernels/kernels.h       |  8 +++++-
 .../cadence/hifi/operators/CMakeLists.txt     |  1 +
 .../cadence/hifi/operators/op_maximum.cpp     |  2 ++
 backends/cadence/hifi/operators/op_mean.cpp   |  4 ++-
 backends/cadence/hifi/operators/op_pow.cpp    | 14 +++++++---
 backends/cadence/hifi/operators/op_where.cpp  |  6 ++--
 .../executor_runner/executor_runner.cpp       | 28 ++++++++++++++++++-
 9 files changed, 68 insertions(+), 13 deletions(-)
diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 0f3e582884c..ac981d321a6 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -61,11 +61,16 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::full_out
+
+- op: gt.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::gt_scalar_out
 
 - op: maximum.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::maximum_out
+      kernel_name: cadence::impl::HiFi::maximum_out
 
 - op: mean.out
   kernels:
@@ -90,17 +95,17 @@
 - op: pow.Scalar_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Scalar_out
+      kernel_name: cadence::impl::HiFi::pow_Scalar_out
 
 - op: pow.Tensor_Scalar_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Tensor_Scalar_out
+      kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out
 
 - op: pow.Tensor_Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::pow_Tensor_Tensor_out
+      kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out
 
 - op: rsqrt.out
   kernels:
diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp
index 10e5fb176e0..0934afde29f 100644
--- a/backends/cadence/hifi/kernels/kernels.cpp
+++ b/backends/cadence/hifi/kernels/kernels.cpp
@@ -20,6 +20,11 @@ memcpy(void* dst, const void* src, size_t num_bytes) {
   MEMCPY_8b(dst, src, num_bytes);
 }
 
+void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) {
+  Result<void*> scratch = ctx.allocate_temp(size); // request `size` bytes
+  return scratch.ok() ? scratch.get() : nullptr; // nullptr if it failed
+}
+
 // Quantize a fp32 value to an int8_t/uint8_t value
 template <typename T>
 __attribute__((always_inline)) T
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 9a4689c17c2..e72284b99de 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -14,8 +14,12 @@
 /* For NNLIB APIs */
 #include "xa_nnlib_kernels_api.h"
 
-/* Potential NNLIB function/APIs */
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::Result;
 
+/* Potential NNLIB function/APIs */
 extern "C" WORD32 xa_nn_broadcast_32_32(
     WORD32* __restrict__ p_out,
     const int* const out_shape,
@@ -149,6 +153,8 @@ namespace impl {
 namespace HiFi {
 namespace kernels {
 
+void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);
+
 void memcpy(void* dst, const void* src, size_t num_bytes);
 
 WORD32 matmul_asym8uxasym8u_asym8u(
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index c01dad5ce80..c66e55f0f42 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -38,6 +38,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp
index 97578765cff..f85d3470e93 100644
--- a/backends/cadence/hifi/operators/op_maximum.cpp
+++ b/backends/cadence/hifi/operators/op_maximum.cpp
@@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -170,3 +171,4 @@ Tensor& maximum_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 478e10da712..cdc844ec5c7 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -125,7 +125,9 @@ Tensor& mean_dim_out(
     int scratch_size = xa_nn_reduce_getsize_nhwc(
         -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);
 
-    void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);
+    void* __restrict__ p_scratch_in =
+        (void* __restrict__)kernels::allocate_temp_memory(
+            ctx, scratch_size * sizeof(int));
 
     xa_nn_reduce_mean_4D_f32_f32(
         p_out,
diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp
index 04533b290bf..74c24afbc0e 100644
--- a/backends/cadence/hifi/operators/op_pow.cpp
+++ b/backends/cadence/hifi/operators/op_pow.cpp
@@ -26,6 +26,7 @@ using executorch::runtime::promoteTypes;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -119,9 +120,11 @@ Tensor& pow_Tensor_Tensor_out(
   if (optimized) {
     if (broadcast) {
       WORD32* __restrict__ ptr1 =
-          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+          (WORD32* __restrict__)kernels::allocate_temp_memory(
+              ctx, num_elm * sizeof(int));
       WORD32* __restrict__ ptr2 =
-          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+          (WORD32* __restrict__)kernels::allocate_temp_memory(
+              ctx, num_elm * sizeof(int));
 
       WORD32* __restrict__ pin1 =
           (WORD32* __restrict__)a.const_data_ptr<float>();
@@ -154,7 +157,8 @@ Tensor& pow_Tensor_Tensor_out(
       free(ptr2);
     } else if (a_is_broadcasted && (!b_is_broadcasted)) {
       FLOAT32* __restrict__ ptr1 =
-          (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32));
+          (FLOAT32* __restrict__)kernels::allocate_temp_memory(
+              ctx, (num_elm + 2) * sizeof(WORD32));
 
       FLOAT32* __restrict__ pin1 =
           (FLOAT32* __restrict__)a.const_data_ptr<float>();
@@ -181,7 +185,8 @@ Tensor& pow_Tensor_Tensor_out(
       free(ptr1);
     } else if (b_is_broadcasted && (!a_is_broadcasted)) {
       WORD32* __restrict__ ptr1 =
-          (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32));
+          (WORD32* __restrict__)kernels::allocate_temp_memory(
+              ctx, num_elm * sizeof(int));
 
       WORD32* __restrict__ pin1 =
           (WORD32* __restrict__)b.const_data_ptr<float>();
@@ -349,3 +354,4 @@ Tensor& pow_Scalar_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp
index 06bd0bc3c9f..c4ad8177cfe 100644
--- a/backends/cadence/hifi/operators/op_where.cpp
+++ b/backends/cadence/hifi/operators/op_where.cpp
@@ -109,8 +109,10 @@ Tensor& where_out(
 
       if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] ||
           con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) {
-        void* p_scratch =
-            malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]);
+        void* p_scratch = (void*)kernels::allocate_temp_memory(
+            ctx,
+            (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) *
+                sizeof(int));
         const unsigned char* p_brd_cond = (const unsigned char*)p_scratch;
         xa_nn_broadcast_8_8(
             (WORD8* __restrict__)p_brd_cond,
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 93c150c0b90..a2476f9165a 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -30,8 +30,16 @@
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/runtime.h>
 
+#if __XTENSA__
+#include <stdio.h>
+#include <sys/times.h>
+#include <xtensa/sim.h>
+#endif
+
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
 
+static uint8_t temp_allocator_pool[1024U * 1024U];
+
 DEFINE_string(
     model_path,
     "model.pte",
@@ -120,6 +128,10 @@ int main(int argc, char** argv) {
   MemoryAllocator method_allocator{
       MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
 
+  // Temporary memory required by kernels
+  MemoryAllocator temp_allocator{
+      MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool)};
+
   // The memory-planned buffers will back the mutable tensors used by the
   // method. The sizes of these buffers were determined ahead of time during the
   // memory-planning pasees.
@@ -144,7 +156,7 @@ int main(int argc, char** argv) {
 
   // Assemble all of the allocators into the MemoryManager that the Executor
   // will use.
-  MemoryManager memory_manager(&method_allocator, &planned_memory);
+  MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
 
   //
   // Load the method from the program, using the provided allocators. Running
@@ -170,8 +182,22 @@ int main(int argc, char** argv) {
       (uint32_t)inputs.error());
   ET_LOG(Info, "Inputs prepared.");
 
+#if __XTENSA__
+  struct tms start, stop;
+  xt_iss_client_command("all", "disable");
+  xt_iss_client_command("all", "enable");
+  times(&start);
+#endif
+
   // Run the model.
   Error status = method->execute();
+
+#if __XTENSA__
+  times(&stop);
+  xt_iss_client_command("all", "disable");
+  ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime));
+#endif
+
   ET_CHECK_MSG(
       status == Error::Ok,
       "Execution of method %s failed with status 0x%" PRIx32,

From a23b646b2a75fbd5f7a475ff8607b0d20ed6a7a5 Mon Sep 17 00:00:00 2001
From: Rushi-cad <gherderu@cadence.com>
Date: Tue, 26 Nov 2024 21:00:37 -0800
Subject: [PATCH 2/3] Wrap add/minimum/rsqrt HiFi ops in cadence namespace

---
 backends/cadence/aot/functions_hifi.yaml       | 6 +++---
 backends/cadence/hifi/operators/op_add.cpp     | 2 ++
 backends/cadence/hifi/operators/op_minimum.cpp | 2 ++
 backends/cadence/hifi/operators/op_rsqrt.cpp   | 2 ++
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index ac981d321a6..edb34924164 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -25,7 +25,7 @@
 - op: add.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::add_out
+      kernel_name: cadence::impl::HiFi::add_out
 
 - op: bmm.out
   kernels:
@@ -80,7 +80,7 @@
 - op: minimum.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::minimum_out
+      kernel_name: cadence::impl::HiFi::minimum_out
 
 - op: mul.out
   kernels:
@@ -110,7 +110,7 @@
 - op: rsqrt.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::rsqrt_out
+      kernel_name: cadence::impl::HiFi::rsqrt_out
 
 - op: sigmoid.out
   kernels:
diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index 10e06938f2e..43cb0d8cd62 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -22,6 +22,7 @@ using executorch::runtime::CppTypeToScalarType;
 using executorch::runtime::KernelRuntimeContext;
 using torch::executor::Error;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -202,3 +203,4 @@ Tensor& add_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp
index fd9cfe4f95f..6f81ad5c3e3 100644
--- a/backends/cadence/hifi/operators/op_minimum.cpp
+++ b/backends/cadence/hifi/operators/op_minimum.cpp
@@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -169,3 +170,4 @@ Tensor& minimum_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp
index c94800aef14..13d8133355c 100644
--- a/backends/cadence/hifi/operators/op_rsqrt.cpp
+++ b/backends/cadence/hifi/operators/op_rsqrt.cpp
@@ -15,6 +15,7 @@ using exec_aten::ScalarType;
 using exec_aten::Tensor;
 using executorch::aten::RuntimeContext;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -51,3 +52,4 @@ Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence

From 5d7aea8d1391c3d0b3512aad5222266e7d3a8e7b Mon Sep 17 00:00:00 2001
From: Rushi-cad <gherderu@cadence.com>
Date: Tue, 26 Nov 2024 21:36:11 -0800
Subject: [PATCH 3/3] Clean up executor_runner: drop Xtensa profiling code

---
 .../executor_runner/executor_runner.cpp       | 22 ++-----------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index a2476f9165a..514a82c0ff0 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -30,12 +30,6 @@
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/runtime.h>
 
-#if __XTENSA__
-#include <stdio.h>
-#include <sys/times.h>
-#include <xtensa/sim.h>
-#endif
-
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
 
 static uint8_t temp_allocator_pool[1024U * 1024U];
@@ -156,7 +150,8 @@ int main(int argc, char** argv) {
 
   // Assemble all of the allocators into the MemoryManager that the Executor
   // will use.
-  MemoryManager memory_manager(&method_allocator, &planned_memory, &temp_allocator);
+  MemoryManager memory_manager(
+      &method_allocator, &planned_memory, &temp_allocator);
 
   //
   // Load the method from the program, using the provided allocators. Running
@@ -182,22 +177,9 @@ int main(int argc, char** argv) {
       (uint32_t)inputs.error());
   ET_LOG(Info, "Inputs prepared.");
 
-#if __XTENSA__
-  struct tms start, stop;
-  xt_iss_client_command("all", "disable");
-  xt_iss_client_command("all", "enable");
-  times(&start);
-#endif
-
   // Run the model.
   Error status = method->execute();
 
-#if __XTENSA__
-  times(&stop);
-  xt_iss_client_command("all", "disable");
-  ET_LOG(Info, "Execute cycles = %ld", (stop.tms_utime - start.tms_utime));
-#endif
-
   ET_CHECK_MSG(
       status == Error::Ok,
       "Execution of method %s failed with status 0x%" PRIx32,