From 3b9dce3917b240c94ad7e7ec7156adcca8349e9c Mon Sep 17 00:00:00 2001
From: Nishak <nishak@cadence.com>
Date: Wed, 13 Nov 2024 22:17:17 -0800
Subject: [PATCH 1/3] Adding cat, full, permute_copy and relu ops

---
 backends/cadence/aot/functions_hifi.yaml      |  12 +-
 backends/cadence/hifi/kernels/CMakeLists.txt  |   2 +
 backends/cadence/hifi/kernels/kernels.h       |  19 ++
 .../cadence/hifi/operators/CMakeLists.txt     |   8 +-
 backends/cadence/hifi/operators/op_cat.cpp    | 156 +++++++++++
 backends/cadence/hifi/operators/op_full.cpp   | 101 +++++++
 .../hifi/operators/op_permute_copy.cpp        | 197 +++++++++++++
 .../hifi/operators/quantized_relu_out.cpp     |  96 +++++++
 .../hifi/third-party/nnlib/xa_nn_concat_32.c  | 172 ++++++++++++
 .../third-party/nnlib/xa_nn_transpose_32.c    | 260 ++++++++++++++++++
 10 files changed, 1015 insertions(+), 8 deletions(-)
 create mode 100644 backends/cadence/hifi/operators/op_cat.cpp
 create mode 100644 backends/cadence/hifi/operators/op_full.cpp
 create mode 100644 backends/cadence/hifi/operators/op_permute_copy.cpp
 create mode 100644 backends/cadence/hifi/operators/quantized_relu_out.cpp
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index bd1102ab0b8..ed4f6a2c1e9 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -35,7 +35,7 @@
 - op: cat.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::cat_out
+      kernel_name: impl::HiFi::cat_out
 
 - op: clone.out
   kernels:
@@ -60,7 +60,7 @@
 - op: full.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::full_out
+      kernel_name: impl::HiFi::full_out
 
 - op: maximum.out
   kernels:
@@ -85,7 +85,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: impl::HiFi::permute_copy_out
 
 - op: pow.Scalar_out
   kernels:
@@ -155,7 +155,6 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out
 
-
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -165,3 +164,8 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out
+
+- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_out
\ No newline at end of file
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index 3d321443f8b..549371255d9 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -10,6 +10,7 @@ add_library(
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
@@ -18,6 +19,7 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 7233fe6c29f..9a4689c17c2 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -23,6 +23,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32(
     const int* const in_shape,
     int num_dims);
 
+extern "C" WORD32 xa_nn_concat_32_32(
+    WORD32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const WORD32** pp_inps,
+    const WORD32* const* pp_inps_shape,
+    WORD32 num_out_dims,
+    WORD32 num_inp,
+    WORD32 num_inp_dims,
+    WORD32 axis);
+
 extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const WORD32* const p_out_shape,
@@ -125,6 +135,15 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
     WORD32 num_axis_dims,
     void* __restrict__ p_scratch_in);
 
+extern "C" WORD32 xa_nn_transpose_32_32(
+    WORD32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const WORD32* __restrict__ p_inp,
+    const WORD32* const p_inp_shape,
+    const WORD32* __restrict__ p_permute_vec,
+    WORD32 num_out_dims,
+    WORD32 num_inp_dims);
+
 namespace cadence {
 namespace impl {
 namespace HiFi {
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index 6d21c4b49a7..c01dad5ce80 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -21,11 +21,14 @@ endif()
 # ATen compliant ops that are needed to run this model.
 set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
@@ -33,11 +36,8 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
@@ -71,7 +71,7 @@ target_include_directories(
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
-             "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp"
+             "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp
new file mode 100644
index 00000000000..7e0031efd5e
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_cat.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <cstring>
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using executorch::aten::RuntimeContext;
+using executorch::runtime::getLeadingDims;
+using executorch::runtime::getTrailingDims;
+using executorch::runtime::resize_tensor;
+using executorch::runtime::tensors_have_same_dim_order;
+using torch::executor::check_cat_args;
+using torch::executor::Error;
+using torch::executor::get_cat_out_target_size;
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+/**
+ * Concatenates `tensors` along `dim` into `out`.
+ * Arguments are validated and `out` is resized first. All-float inputs within
+ * the limits checked by xa_nn_concat_32_32 go to that NNLib kernel; any other
+ * case uses the portable copy loop, which also casts to the output dtype.
+ * @param ctx Kernel context that receives the error on failure.
+ * @param tensors Tensors to concatenate; empty tensors are skipped.
+ * @param dim Concatenation axis; negative values count from the back.
+ * @param out Destination tensor.
+ * @return `out`.
+ */
+Tensor& cat_out(
+    RuntimeContext& ctx,
+    exec_aten::ArrayRef<Tensor> tensors,
+    int64_t dim,
+    Tensor& out) {
+  constexpr auto name = "cat.out";
+  // Limits taken from the argument checks inside xa_nn_concat_32_32.
+  constexpr int kNnlibMaxDim = 6;
+  constexpr int kNnlibMaxInp = 10;
+
+  if (dim < 0) {
+    dim += out.dim();
+  }
+
+  ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out);
+
+  Tensor::SizesType
+      expected_out_size[executorch::runtime::kTensorDimensionLimit];
+  size_t expected_out_dim = 0;
+  get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // Special handling when all inputs are 1D-empty tensors for aten consistency
+  // In that case, just return an 1D-empty tensor without checking dim
+  bool all_1d_empty = true;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
+      all_1d_empty = false;
+      break;
+    }
+  }
+  if (all_1d_empty) {
+    return out;
+  }
+
+  // The NNLib kernel copies raw 32-bit words into fixed-size shape tables, so
+  // every non-empty input must be float and the counts must fit the limits.
+  bool optimized = out.scalar_type() == ScalarType::Float &&
+      out.numel() > 0 && out.dim() > 0 && out.dim() <= kNnlibMaxDim;
+  int num_non_empty = 0;
+  for (size_t i = 0; optimized && i < tensors.size(); ++i) {
+    if (tensors[i].numel() == 0) {
+      continue;
+    }
+    optimized = tensors[i].scalar_type() == ScalarType::Float &&
+        ++num_non_empty <= kNnlibMaxInp;
+  }
+
+  if (optimized) {
+    const WORD32 num_dims = out.dim();
+    WORD32 inp_shape[kNnlibMaxInp][kNnlibMaxDim];
+    WORD32 p_out_shape[kNnlibMaxDim];
+    const WORD32* ptr_shape[kNnlibMaxInp];
+    const WORD32* ptr[kNnlibMaxInp];
+
+    WORD32 num_inp = 0;
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      if (tensors[i].numel() == 0) {
+        continue;
+      }
+      ptr[num_inp] = (const WORD32*)tensors[i].const_data_ptr<float>();
+      for (int j = 0; j < num_dims; ++j) {
+        inp_shape[num_inp][j] = tensors[i].size(j);
+      }
+      ptr_shape[num_inp] = inp_shape[num_inp];
+      ++num_inp;
+    }
+    for (int i = 0; i < num_dims; ++i) {
+      p_out_shape[i] = out.size(i);
+    }
+
+    const WORD32 ret_val = xa_nn_concat_32_32(
+        (WORD32*)out.mutable_data_ptr<float>(),
+        p_out_shape, ptr, ptr_shape,
+        num_dims, num_inp, num_dims, (WORD32)dim);
+
+    ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+    return out;
+  }
+
+  const size_t outer = getLeadingDims(out, dim);
+  const size_t dim_stride = getTrailingDims(out, dim);
+  const size_t ninputs = tensors.size();
+  const auto out_type = out.scalar_type();
+  ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] {
+    CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
+    for (size_t i = 0; i < outer; ++i) {
+      for (size_t j = 0; j < ninputs; ++j) {
+        const auto in_type = tensors[j].scalar_type();
+        ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] {
+          if (tensors[j].numel() == 0) {
+            return;
+          }
+          size_t inner = tensors[j].size(dim) * dim_stride;
+          const CTYPE_IN* const in_ptr =
+              tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;
+          for (size_t k = 0; k < inner; ++k) {
+            out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
+          }
+          out_ptr += inner;
+        });
+      }
+    }
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp
new file mode 100644
index 00000000000..0afb2152380
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_full.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <stdio.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+using exec_aten::IntArrayRef;
+using exec_aten::RuntimeContext;
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::Error;
+using torch::executor::native::utils::extract_scalar;
+using torch::executor::native::utils::get_scalar_dtype;
+
+/**
+ * Fills `out`, resized to `sizes`, with `fill_value`.
+ * When the scalar and `out` share a dtype of Long, Float, Byte or Char the
+ * fill uses a typed loop, xa_nn_memset_f32_f32 or memset; every other
+ * combination goes through the generic cast-and-store loop.
+ * @return `out`; failures are reported through `ctx`.
+ */
+Tensor& full_out(
+    RuntimeContext& ctx,
+    const IntArrayRef sizes,
+    const Scalar& fill_value,
+    Tensor& out) {
+  ScalarType val_type = get_scalar_dtype(fill_value);
+  ScalarType out_type = out.scalar_type();
+
+  // Resize for dynamic shape
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      resize_tensor(out, sizes) == Error::Ok,
+      InvalidArgument,
+      out,
+      "Failed to resize output tensor.");
+
+  constexpr auto name = "full.out";
+
+  const bool optimized = out_type == val_type &&
+      (out_type == ScalarType::Long || out_type == ScalarType::Float ||
+       out_type == ScalarType::Byte || out_type == ScalarType::Char);
+
+  if (optimized) {
+    if (out_type == ScalarType::Long) {
+      // Long tensors hold 64-bit elements, so fill through an int64_t pointer.
+      int64_t* data_out = out.mutable_data_ptr<int64_t>();
+      int64_t val = 0;
+      ET_KERNEL_CHECK(
+          ctx, extract_scalar(fill_value, &val), InvalidArgument, out);
+      for (size_t i = 0; i < out.numel(); ++i) {
+        data_out[i] = val;
+      }
+    } else if (out_type == ScalarType::Float) {
+      float* data_out = out.mutable_data_ptr<float>();
+      float val = 0;
+      ET_KERNEL_CHECK(
+          ctx, extract_scalar(fill_value, &val), InvalidArgument, out);
+      WORD32 ret_val = xa_nn_memset_f32_f32(data_out, val, out.numel());
+      ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+    } else {
+      // Byte or Char: one byte per element, so memset covers the buffer.
+      char* data_out = out.mutable_data_ptr<char>();
+      int val = 0;
+      ET_KERNEL_CHECK(
+          ctx, extract_scalar(fill_value, &val), InvalidArgument, out);
+      memset((void*)data_out, val, out.numel());
+    }
+    return out;
+  }
+
+  ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, name, CTYPE_VAL, [&] {
+    CTYPE_VAL val;
+    extract_scalar(fill_value, &val);
+
+    ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, name, CTYPE_OUT, [&] {
+      CTYPE_OUT val_casted = static_cast<CTYPE_OUT>(val);
+      auto data_out = out.mutable_data_ptr<CTYPE_OUT>();
+      for (size_t i = 0; i < out.numel(); ++i) {
+        data_out[i] = val_casted;
+      }
+    });
+  });
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
\ No newline at end of file
diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp
new file mode 100644
index 00000000000..37b0aa8b058
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using exec_aten::ScalarType;
+using exec_aten::SizesType;
+using exec_aten::Tensor;
+using executorch::runtime::IntArrayRef;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::kTensorDimensionLimit;
+using executorch::runtime::resize_tensor;
+using executorch::runtime::tensors_have_same_dim_order;
+using torch::executor::check_permute_copy_args;
+using torch::executor::Error;
+using torch::executor::get_permute_copy_out_target_size;
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+namespace {
+
+// Steps `coordinate` to the next element of `tensor` in `dims` order.
+void increment_coordinate_permuted(
+    const Tensor& tensor,
+    size_t* const coordinate,
+    IntArrayRef dims) {
+  for (int axis = dims.size() - 1; axis >= 0; --axis) {
+    const size_t d = dims[axis] >= 0 ? dims[axis] : dims[axis] + tensor.dim();
+    if (++coordinate[d] != tensor.size(d)) {
+      return;
+    }
+    // Axis `d` ran off its end: rewind it and carry into the next axis.
+    coordinate[d] = 0;
+  }
+}
+
+} // namespace
+
+/**
+ * Writes to `out` a copy of `in` whose dimensions are reordered by `dims`.
+ *
+ * Float, Char and Byte tensors of rank 1 to 5 without a zero-sized dimension
+ * are handed to the NNLib transpose kernels; every other input takes the
+ * portable coordinate-walking loop.
+ *
+ * @param ctx Kernel context that receives the error on failure.
+ * @param in Source tensor.
+ * @param dims Permutation; output dimension i is input dimension dims[i].
+ *     Negative entries count from the back.
+ * @param out Destination tensor, resized to the permuted shape.
+ * @return `out`.
+ */
+Tensor& permute_copy_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    IntArrayRef dims,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out);
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  Tensor::SizesType expected_out_size[kTensorDimensionLimit];
+  size_t expected_out_dim = 0;
+  get_permute_copy_out_target_size(
+      in, dims, expected_out_size, &expected_out_dim);
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  const auto in_type = out.scalar_type();
+
+  constexpr auto name = "permute_copy.out";
+  // xa_nn_transpose_32_32 accepts 1 to 5 dimensions of positive extent; the
+  // same bounds are assumed for xa_nn_transpose_8_8 - TODO confirm.
+  constexpr int kNnlibMaxDim = 5;
+
+  const bool optimized =
+      (in_type == ScalarType::Float || in_type == ScalarType::Char ||
+       in_type == ScalarType::Byte) &&
+      in.dim() > 0 && in.dim() <= kNnlibMaxDim && in.numel() > 0;
+
+  if (optimized) {
+    if (in_type == ScalarType::Float) {
+      WORD32* p_inp = (WORD32*)in.const_data_ptr<float>();
+      WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();
+      WORD32 num_inp_dims = in.dim();
+
+      WORD32 p_inp_shape[kNnlibMaxDim];
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_permute_vec[kNnlibMaxDim];
+
+      for (int i = 0; i < num_inp_dims; i++) {
+        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + num_inp_dims;
+        p_inp_shape[i] = in.size(i);
+        p_out_shape[i] = in.size(d);
+        p_permute_vec[i] = d;
+      }
+
+      WORD32 val = xa_nn_transpose_32_32(
+          p_out,
+          p_out_shape,
+          p_inp,
+          p_inp_shape,
+          p_permute_vec,
+          num_inp_dims,
+          num_inp_dims);
+      ET_KERNEL_CHECK(ctx, val == 0, Internal, out);
+    } else if (in_type == ScalarType::Char) {
+      WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
+      WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();
+      WORD32 num_inp_dims = in.dim();
+
+      WORD32 p_inp_shape[kNnlibMaxDim];
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_permute_vec[kNnlibMaxDim];
+
+      for (int i = 0; i < num_inp_dims; i++) {
+        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + num_inp_dims;
+        p_inp_shape[i] = in.size(i);
+        p_out_shape[i] = in.size(d);
+        p_permute_vec[i] = d;
+      }
+
+      WORD32 val = xa_nn_transpose_8_8(
+          p_out,
+          p_out_shape,
+          p_inp,
+          p_inp_shape,
+          p_permute_vec,
+          num_inp_dims,
+          num_inp_dims);
+      ET_KERNEL_CHECK(ctx, val == 0, Internal, out);
+    } else if (in_type == ScalarType::Byte) {
+      WORD8* p_inp = (WORD8*)in.const_data_ptr<uint8_t>();
+      WORD8* p_out = (WORD8*)out.mutable_data_ptr<uint8_t>();
+      WORD32 num_inp_dims = in.dim();
+
+      WORD32 p_inp_shape[kNnlibMaxDim];
+      WORD32 p_out_shape[kNnlibMaxDim];
+      WORD32 p_permute_vec[kNnlibMaxDim];
+
+      for (int i = 0; i < num_inp_dims; i++) {
+        const WORD32 d = dims[i] >= 0 ? dims[i] : dims[i] + num_inp_dims;
+        p_inp_shape[i] = in.size(i);
+        p_out_shape[i] = in.size(d);
+        p_permute_vec[i] = d;
+      }
+
+      WORD32 val = xa_nn_transpose_8_8(
+          p_out,
+          p_out_shape,
+          p_inp,
+          p_inp_shape,
+          p_permute_vec,
+          num_inp_dims,
+          num_inp_dims);
+      ET_KERNEL_CHECK(ctx, val == 0, Internal, out);
+    }
+    return out;
+  }
+
+  size_t in_coord[kTensorDimensionLimit] = {0};
+  size_t trailing_dims_memo[kTensorDimensionLimit];
+  executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);
+
+  // in and out must be the same dtype
+  ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] {
+    const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
+    CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
+
+    for (size_t i = 0; i < out.numel(); ++i) {
+      out_data[i] =
+          in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
+              in, in_coord, trailing_dims_memo)];
+      increment_coordinate_permuted(in, in_coord, dims);
+    }
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp
new file mode 100644
index 00000000000..3ddf18c7411
--- /dev/null
+++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using Tensor = exec_aten::Tensor;
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+using ScalarType = exec_aten::ScalarType;
+
+namespace cadence {
+namespace impl {
+namespace HiFi {
+namespace native {
+
+// Reference ReLU on quantized data: clamps `input` at `in_zero_point`, then
+// requantizes the offset with the scale encoded by `out_multiplier` and
+// `out_shift` (only their first elements are read).
+template <typename T>
+void quantized_relu_(
+    const Tensor& input,
+    const Tensor& in_zero_point,
+    const int64_t out_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    Tensor& output) {
+  T q_zero_point = in_zero_point.const_data_ptr<T>()[0];
+  const T* __restrict__ in = input.const_data_ptr<T>();
+  T* __restrict__ out = output.mutable_data_ptr<T>();
+  const int32_t multiplier = out_multiplier.const_data_ptr<int32_t>()[0];
+  const int32_t shift = out_shift.const_data_ptr<int32_t>()[0];
+
+  // out_scale = multiplier / 2^31 * 2^shift. The divisor is spelled as a
+  // 64-bit literal because `1 << 31` overflows a 32-bit int.
+  const float out_scale = multiplier * 1.0 / (1LL << 31) * pow(2, shift);
+
+  for (size_t i = 0, e = input.numel(); i < e; ++i) {
+    const T temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0;
+    out[i] = kernels::quantize<T>(temp, out_scale, out_zero_point);
+  }
+}
+
+/**
+ * Quantized ReLU for Byte and Char tensors.
+ *
+ * Each element of `input` is clamped from below at the first element of
+ * `in_zero_point` and from above at the largest value of the element type by
+ * the NNLib activation kernels. `out_zero_point`, `out_multiplier` and
+ * `out_shift` are not read on this path. Any other dtype, or a non-zero
+ * status from NNLib, is reported through ET_CHECK_MSG.
+ */
+void quantized_relu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& in_zero_point,
+    const int64_t out_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    Tensor& output) {
+  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
+    const uint8_t* p_in = input.const_data_ptr<uint8_t>();
+    uint8_t* p_out = output.mutable_data_ptr<uint8_t>();
+    uint8_t q_zero_point = in_zero_point.const_data_ptr<uint8_t>()[0];
+
+    // Clamp to [q_zero_point, 255]; 255 is the largest uint8 value.
+    WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u(
+        p_out, p_in, (int)q_zero_point, (int)255, input.numel());
+
+    ET_CHECK_MSG(ret_val == 0, "An internal error occured");
+  } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
+    const int8_t* p_in = input.const_data_ptr<int8_t>();
+    int8_t* p_out = output.mutable_data_ptr<int8_t>();
+    int8_t q_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
+
+    // Clamp to [q_zero_point, 127]; 127 is the largest int8 value.
+    WORD32 ret_val = xa_nn_vec_activation_min_max_8_8(
+        p_out, p_in, (int)q_zero_point, (int)127, input.numel());
+
+    ET_CHECK_MSG(ret_val == 0, "An internal error occured");
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
+  }
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
+} // namespace cadence
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
new file mode 100644
index 00000000000..244f404d2ea
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
@@ -0,0 +1,172 @@
+#include "xa_type_def.h"
+#include "xa_nn_common.h"
+#include "xa_nnlib_kernels_api.h"
+#include "xa_nnlib_common_macros.h"
+#include "xa_nnlib_err_chk.h"
+#include "xa_nnlib_common.h"
+/* Concatenates num_inp 32-bit tensors along axis into p_out; returns 0 on success. */
+WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out
+                        ,const WORD32 *const p_out_shape
+                        ,const WORD32 **pp_inps
+                        ,const WORD32 *const *pp_inps_shape
+                        ,WORD32 num_out_dims
+                        ,WORD32 num_inp
+                        ,WORD32 num_inp_dims
+                        ,WORD32 axis)
+{
+  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
+  XA_NNLIB_ARG_CHK_PTR(pp_inps, -1);
+  XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1);
+  /* Pointer alignment checks */
+  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1);
+  // Validate arguments: 1-6 dims, 1-10 inputs, equal ranks, axis in range.
+  XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1);
+  XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1);
+  XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1);
+  XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1);
+  // Every output extent must be positive.
+  int i = 0, j = 0;
+  for(i = 0; i < num_out_dims; i++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1);
+  }
+  // Fold a negative axis onto its non-negative equivalent.
+  if(axis < 0)
+    axis = num_out_dims + axis;
+  // Sum the axis extents of the inputs; other extents must match p_out_shape.
+  WORD32 concat_size = 0;
+  for (i = 0; i < num_inp; i++)
+  {
+    XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1);
+    XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1);
+    XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1);
+#pragma loop_count min=1
+    for(j = 0; j < num_out_dims; j++)
+    {
+      XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1);
+    }
+    // Each input needs a positive extent along the concatenation axis.
+    XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1);
+    concat_size += pp_inps_shape[i][axis];
+  }
+  // The summed axis extents must equal the output's axis extent.
+  XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1);
+
+  // outer_size: product of extents before axis; base_inner_size: product after.
+  WORD32 outer_size = 1;
+#pragma no_simd
+  for(int i = 0; i < axis; i++)
+  {
+    outer_size *= p_out_shape[i];
+  }
+
+  WORD32 base_inner_size = 1;
+#pragma no_simd
+  for(int i = axis + 1; i < num_out_dims; i++)
+  {
+    base_inner_size *= p_out_shape[i];
+  }
+
+  WORD32 *ptmp_out = p_out;
+  for(int i = 0; i < num_inp; i++)
+  {
+    const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size;
+    WORD32 *output_ptr = ptmp_out;
+    const WORD32* input_ptr = pp_inps[i];
+    // Per k step: copy copy_size words, then skip concat_size * base_inner_size.
+    if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0)
+      && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0))
+    {
+      if(copy_size <= 8)
+      {
+        const ae_f32 *pae_inp = (const ae_f32 *)input_ptr;
+        for(int k = 0; k < outer_size; k++)
+        {
+          ae_f32 *pae_out = (ae_f32 *)output_ptr;
+#pragma concurrent
+#pragma no_simd
+          for(int ic = 0; ic < copy_size; ic++)
+          {
+            *pae_out++ = *pae_inp++;
+          }
+          output_ptr += concat_size * base_inner_size;
+        }
+      }
+      else
+      {
+        for(int k = 0; k < outer_size; k++)
+        {
+          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
+          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
+          ae_valign inp_a, out_a;
+          inp_a = AE_LA64_PP(pae_inp);
+          out_a = AE_ZALIGN64();
+          for(int ic = 0; ic < (copy_size >> 1); ic++)
+          {
+            ae_int32x2 d0;
+            AE_LA32X2_IP(d0, inp_a, pae_inp);
+            AE_SA32X2_IP(d0, out_a, pae_out);
+          }
+          AE_SA64POS_FP(out_a, pae_out);
+          const ae_f32 *puae_inp = (const ae_f32 *)pae_inp;
+          ae_f32 *puae_out = (ae_f32 *)pae_out;
+#pragma concurrent
+          for(int ic = 0; ic < (copy_size & 1); ic++)
+          {
+            puae_out[copy_size - 1] = puae_inp[copy_size - 1];
+          }
+          input_ptr += copy_size;
+          output_ptr += concat_size * base_inner_size;
+        }
+      }
+    }
+    else
+    {
+      if(copy_size <= 6)
+      {
+        for(int k = 0; k < outer_size; k++)
+        {
+#pragma concurrent
+#pragma no_unroll
+          for(int ic = 0; ic < copy_size; ic++)
+          {
+            output_ptr[ic] = *input_ptr++;
+          }
+          output_ptr += concat_size * base_inner_size;
+        }
+      }
+      else
+      {
+        for(int k = 0; k < outer_size; k++)
+        {
+          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
+          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
+          ae_valign inp_a, out_a;
+          inp_a = AE_LA64_PP(pae_inp);
+          out_a = AE_ZALIGN64();
+
+#pragma concurrent
+          for(int ic = 0; ic < copy_size >> 1; ic++)
+          {
+            ae_int32x2 d0;
+            AE_LA32X2_IP(d0, inp_a, pae_inp);
+            AE_SA32X2_IP(d0, out_a, pae_out);
+          }
+          AE_SA64POS_FP(out_a, pae_out);
+          // Copy the last word when copy_size is odd.
+          for(int ic = 0; ic < (copy_size & 1); ic++)
+          {
+            output_ptr[copy_size - 1] = input_ptr[copy_size - 1];
+          }
+          input_ptr += copy_size;
+          output_ptr += concat_size * base_inner_size;
+        }
+      }
+    }
+    ptmp_out += copy_size;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
new file mode 100644
index 00000000000..e7b80e3a1d9
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
@@ -0,0 +1,303 @@
+#include "xa_nnlib_common.h"
+#include "stdio.h"
+/*
+ * Currently only supports up to 5D input tensors.
+ * 1/2/3/4 D input tensors will be scaled up to 5D.
+ * For example, 2x3 -> 1x1x1x2x3.
+ */
+
+/*
+ * Permutes the dimensions of a tensor of 32-bit elements.
+ *
+ * p_out          output buffer; filled sequentially in output order
+ * p_out_shape    output shape, num_out_dims entries, each > 0 and equal to
+ *                p_inp_shape[p_permute_vec[i]]
+ * p_inp          input buffer; addressed with strides built from p_inp_shape
+ * p_inp_shape    input shape, num_inp_dims entries, each > 0
+ * p_permute_vec  input dimension feeding each output dimension; must be a
+ *                permutation of 0 .. num_inp_dims-1
+ * num_out_dims   number of output dimensions; must equal num_inp_dims
+ * num_inp_dims   number of input dimensions, 1 to 5
+ *
+ * Returns 0 once the copy is done. Argument validation goes through the
+ * XA_NNLIB_ARG_CHK_* macros, which are handed -1 as the error value.
+ */
+WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
+                    ,const WORD32 *const p_out_shape
+                    ,const WORD32 * __restrict__ p_inp
+                    ,const WORD32 *const p_inp_shape
+                    ,const WORD32 * __restrict__ p_permute_vec
+                    ,WORD32 num_out_dims
+                    ,WORD32 num_inp_dims)
+{
+  /* NULL pointer checks */
+  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
+
+  /* Invalid input checks */
+  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
+  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
+
+  int itr = 0;
+  for(itr=0; itr < num_inp_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
+  }
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
+  }
+
+  /* Each permute entry must name a valid input dimension, and no dimension
+   * may be named twice. This is checked before the entries are used to
+   * index p_inp_shape; a repeated entry could also pass the shape check
+   * below and then drive the strided reads past the end of p_inp. */
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND(((p_permute_vec[itr] < 0) || (p_permute_vec[itr] >= num_inp_dims)), -1);
+    for(int prev = 0; prev < itr; prev++)
+    {
+      XA_NNLIB_ARG_CHK_COND((p_permute_vec[prev] == p_permute_vec[itr]), -1);
+    }
+  }
+
+  /* Output shape provided must be correct based on input
+   * shape and permute values */
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    int output_dim = p_out_shape[itr];
+    int expected_dim = p_inp_shape[p_permute_vec[itr]];
+    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
+  }
+
+  /* Pointer alignment checks */
+  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
+
+  /* Shift all dim with 1 in the outer part: a size-1 output dimension that
+   * sits behind a larger one is swapped with it, together with its permute
+   * entry, so that the trailing dimensions end up being the non-unit ones. */
+  int eff_output_shape[5];
+  int eff_permute_vec[5];
+
+  for(int i = 0; i < num_out_dims; i++)
+  {
+    eff_output_shape[i] = p_out_shape[i];
+    eff_permute_vec[i] = p_permute_vec[i];
+  }
+
+  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
+  while(one_i > 0 && non_one_i >=0){
+    while(one_i > 0 && eff_output_shape[one_i]!=1){
+      one_i--;
+    }
+    non_one_i = one_i;
+    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
+    {
+      non_one_i--;
+    }
+    if(one_i > 0 && non_one_i >=0){
+      int temp;
+      /*swap output_shape*/
+      {
+        temp = eff_output_shape[one_i];
+        eff_output_shape[one_i] = eff_output_shape[non_one_i];
+        eff_output_shape[non_one_i] = temp;
+      }
+      /*swap permute_vec*/
+      {
+        temp = eff_permute_vec[one_i];
+        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
+        eff_permute_vec[non_one_i] = temp;
+      }
+
+    }
+  }
+
+  /* Promoting lesser dim tensors to 5D tensors.
+   * Also updating the permute_vec and shapes as needed for optimization */
+  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
+
+  /* Check if any inner inp dimension is same in the output:
+   * last_n_same_dim counts the innermost dimensions left in place. */
+  int last_dim_same = 1, last_n_same_dim = 0;
+  itr = num_inp_dims - 1;
+  while(itr >= 0)
+  {
+    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
+    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
+    itr--;
+  }
+
+  /* Fill the 5D shapes from the innermost dimension outwards; the
+   * last_n_same_dim unchanged inner dimensions are multiplied into slot 4. */
+  int dims_added = 5 - num_inp_dims;
+  itr = num_inp_dims - 1;
+  int same_count = last_n_same_dim;
+  int count = 4;
+  while(itr >= 0)
+  {
+    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
+    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
+    same_count--;
+    itr--;
+    count = (same_count > 0) ? count : count - 1;
+  }
+
+  /* Build the matching 5D permute vector; entries are offset by the number
+   * of dimensions added in front. */
+  itr = num_inp_dims - 1;
+  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
+  count = 4;
+  while(itr >= 0)
+  {
+    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
+    same_count--;
+    itr--;
+    count--;
+  }
+
+  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
+  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
+  int inp_stride[5];
+
+  out_dim0 = p_5D_out_shape[0];
+  out_dim1 = p_5D_out_shape[1];
+  out_dim2 = p_5D_out_shape[2];
+  out_dim3 = p_5D_out_shape[3];
+  out_dim4 = p_5D_out_shape[4];
+
+  inp_dim1 = p_5D_inp_shape[1];
+  inp_dim2 = p_5D_inp_shape[2];
+  inp_dim3 = p_5D_inp_shape[3];
+  inp_dim4 = p_5D_inp_shape[4];
+
+  /* Element strides of the 5D input; the innermost stride is 1 */
+  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[2] = inp_dim3*inp_dim4;
+  inp_stride[3] = inp_dim4;
+  inp_stride[4] = 1;
+
+  if(last_n_same_dim)
+  {
+    /* The innermost output run is also contiguous in the input, so each
+     * run of out_dim4 elements is copied two elements at a time. */
+    int itr0, itr1, itr2, itr3, itr4;
+    WORD32 *p_inp0 = (WORD32 *)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+#pragma loop_count min=1
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+#pragma loop_count min=1
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+#pragma loop_count min=1
+          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
+          {
+            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+            /* The plain 64-bit load/store pair is taken only when both
+             * pointers are 8-byte aligned; WORD32 data that is merely
+             * 4-byte aligned goes through the aligning path below. */
+            if((((unsigned)p_inp4 & 7) == 0) && (((unsigned)p_out & 7) == 0))
+            {
+              ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
+              ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
+              ae_int32x2 d0;
+              for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
+              {
+                AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32));
+                AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32));
+              }
+              ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
+              ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
+#pragma loop_count max=3
+              for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
+              {
+                puae_o[itr4] = puae_i[itr4];
+              }
+            }
+            else
+            {
+              ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
+              ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
+              ae_valign a_inp = AE_LA64_PP(pae_i);
+              ae_valign a_out = AE_ZALIGN64();
+              ae_int32x2 d0;
+              for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
+              {
+                AE_LA32X2_IP(d0, a_inp, pae_i);
+                AE_SA32X2_IP(d0, a_out, pae_o);
+              }
+              AE_SA64POS_FP(a_out, pae_o);
+              ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
+              ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
+#pragma loop_count max=3
+              for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
+              {
+                puae_o[itr4] = puae_i[itr4];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* The innermost output dimension is strided in the input: two elements
+     * are gathered per iteration and stored together as one pair. */
+    int itr0, itr1, itr2, itr3, itr4;
+    WORD32 *p_inp0 = (WORD32 *)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+          for(itr3 = 0; itr3 < out_dim3; itr3++)
+          {
+            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+
+            ae_valign a_out = AE_ZALIGN64();
+            for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
+            {
+              ae_int32x2 d0, d1;
+              ae_int32x2 tmp0;
+
+              AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
+              AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2);
+
+              tmp0 = AE_SEL32_HH(d0, d1);
+
+              AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out);
+            }
+            AE_SA64POS_FP(a_out, p_out);
+#pragma loop_count max=3
+            for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
+            {
+              *p_out++ = *p_inp4;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}
\ No newline at end of file

From 495c1525293bed746be05b523c3fe2e46cc2bc3f Mon Sep 17 00:00:00 2001
From: Nishak <nishak@cadence.com>
Date: Thu, 14 Nov 2024 06:37:41 -0800
Subject: [PATCH 2/3] clean up

---
 backends/cadence/aot/functions_hifi.yaml      |  8 ++--
 backends/cadence/hifi/operators/op_cat.cpp    |  2 +
 backends/cadence/hifi/operators/op_full.cpp   | 19 +++++-----
 .../hifi/operators/op_permute_copy.cpp        | 15 ++++----
 .../hifi/operators/quantized_relu_out.cpp     | 38 +++++++------------
 5 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index ed4f6a2c1e9..0f3e582884c 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -35,7 +35,7 @@
 - op: cat.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::cat_out
+      kernel_name: cadence::impl::HiFi::cat_out
 
 - op: clone.out
   kernels:
@@ -60,7 +60,7 @@
 - op: full.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::full_out
+      kernel_name: cadence::impl::HiFi::full_out
 
 - op: maximum.out
   kernels:
@@ -70,7 +70,7 @@
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mean_dim_out   
+      kernel_name: cadence::impl::HiFi::mean_dim_out
 
 - op: minimum.out
   kernels:
@@ -85,7 +85,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::permute_copy_out
+      kernel_name: cadence::impl::HiFi::permute_copy_out
 
 - op: pow.Scalar_out
   kernels:
diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp
index 7e0031efd5e..1a628924457 100644
--- a/backends/cadence/hifi/operators/op_cat.cpp
+++ b/backends/cadence/hifi/operators/op_cat.cpp
@@ -23,6 +23,7 @@ using torch::executor::check_cat_args;
 using torch::executor::Error;
 using torch::executor::get_cat_out_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -154,3 +155,4 @@ Tensor& cat_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
\ No newline at end of file
diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp
index 0afb2152380..47804a64f45 100644
--- a/backends/cadence/hifi/operators/op_full.cpp
+++ b/backends/cadence/hifi/operators/op_full.cpp
@@ -11,6 +11,7 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <stdio.h>
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -44,13 +45,13 @@ Tensor& full_out(
 
   constexpr auto name = "full.out";
 
-  bool optimized = 0;
+  bool optimized = false;
   if (out_type == ScalarType::Long || out_type == ScalarType::Float ||
       out_type == ScalarType::Byte || out_type == ScalarType::Char)
-    optimized = 1;
-    
-  if(out_type != val_type)
-    optimized = 0;
+    optimized = true;
+
+  if (out_type != val_type)
+    optimized = false;
 
   if (optimized) {
     if (out_type == ScalarType::Long) {
@@ -65,10 +66,7 @@ Tensor& full_out(
       float val;
       extract_scalar(fill_value, &val);
 
-      WORD32 ret_val = xa_nn_memset_f32_f32(
-      data_out,
-      val,
-      out.numel());
+      WORD32 ret_val = xa_nn_memset_f32_f32(data_out, val, out.numel());
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
 
@@ -98,4 +96,5 @@ Tensor& full_out(
 
 } // namespace native
 } // namespace HiFi
-} // namespace impl
\ No newline at end of file
+} // namespace impl
+} // namespace cadence
\ No newline at end of file
diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp
index 37b0aa8b058..bb72eaf521a 100644
--- a/backends/cadence/hifi/operators/op_permute_copy.cpp
+++ b/backends/cadence/hifi/operators/op_permute_copy.cpp
@@ -22,6 +22,7 @@ using torch::executor::check_permute_copy_args;
 using torch::executor::Error;
 using torch::executor::get_permute_copy_out_target_size;
 
+namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -75,11 +76,9 @@ Tensor& permute_copy_out(
 
   bool optimized = false;
 
-  if (out.scalar_type() == ScalarType::Float)
-    optimized = true;
-  else if (out.scalar_type() == ScalarType::Char)
-    optimized = true;
-  else if (out.scalar_type() == ScalarType::Byte)
+  if (out.scalar_type() == ScalarType::Float ||
+      out.scalar_type() == ScalarType::Char ||
+      out.scalar_type() == ScalarType::Byte)
     optimized = true;
 
   if (in.dim() > kNnlibMaxDim)
@@ -103,7 +102,7 @@ Tensor& permute_copy_out(
         p_permute_vec[i] = dims[i];
       }
 
-      WORD32 val = xa_nn_transpose_32_32(
+      WORD32 ret_val = xa_nn_transpose_32_32(
           p_out,
           p_out_shape,
           p_inp,
@@ -112,7 +111,8 @@ Tensor& permute_copy_out(
           num_out_dims,
           num_inp_dims);
 
-      return out;
+      ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
+
     } else if (in_type == ScalarType::Char) {
       WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
       WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();
@@ -195,3 +195,4 @@ Tensor& permute_copy_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
+} // namespace cadence
\ No newline at end of file
diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp
index 3ddf18c7411..031a472cced 100644
--- a/backends/cadence/hifi/operators/quantized_relu_out.cpp
+++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp
@@ -53,35 +53,25 @@ void quantized_relu_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     Tensor& output) {
-    if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
-    const uint8_t *p_in = input.const_data_ptr<uint8_t>();
-    uint8_t *p_out =output.mutable_data_ptr<uint8_t>();
+  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
+    const uint8_t* p_in = input.const_data_ptr<uint8_t>();
+    uint8_t* p_out = output.mutable_data_ptr<uint8_t>();
     uint8_t q_zero_point = in_zero_point.const_data_ptr<uint8_t>()[0];
-    
-    WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u( p_out,
-                                      p_in,
-                                      (int)q_zero_point,
-                                      (int)255,
-                                      input.numel());
-									  ret_val = 5;
 
-    ET_CHECK_MSG(
-        ret_val == 0,
-        "An internal error occured");
+    WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u(
+        p_out, p_in, (int)q_zero_point, (int)255, input.numel());
+
+    ET_CHECK_MSG(ret_val == 0, "An internal error occured");
   } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
-    const int8_t *p_in = input.const_data_ptr<int8_t>();
-    int8_t *p_out = output.mutable_data_ptr<int8_t>();
+    const int8_t* p_in = input.const_data_ptr<int8_t>();
+    int8_t* p_out = output.mutable_data_ptr<int8_t>();
     int8_t q_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
-    
-    WORD32 ret_val = xa_nn_vec_activation_min_max_8_8( p_out,
-                                      p_in,
-                                      (int)q_zero_point,
-                                      (int)128,
-                                      input.numel());
 
-    ET_CHECK_MSG(
-        ret_val == 0,
-        "An internal error occured");
+    WORD32 ret_val = xa_nn_vec_activation_min_max_8_8(
+        p_out, p_in, (int)q_zero_point, (int)128, input.numel());
+
+    ET_CHECK_MSG(ret_val == 0, "An internal error occured");
+
   } else {
     ET_CHECK_MSG(
         false,

From 0c6112aa57d25d884022aa9aa6983ba4aadb9aab Mon Sep 17 00:00:00 2001
From: Nishak <nishak@cadence.com>
Date: Thu, 14 Nov 2024 09:38:44 -0800
Subject: [PATCH 3/3] changing relu op

---
 .../hifi/operators/quantized_relu_out.cpp     | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp
index 031a472cced..98f130df8c7 100644
--- a/backends/cadence/hifi/operators/quantized_relu_out.cpp
+++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp
@@ -58,17 +58,34 @@ void quantized_relu_out(
     uint8_t* p_out = output.mutable_data_ptr<uint8_t>();
     uint8_t q_zero_point = in_zero_point.const_data_ptr<uint8_t>()[0];
 
-    WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u(
-        p_out, p_in, (int)q_zero_point, (int)255, input.numel());
+    WORD32 ret_val = xa_nn_vec_relu_asym8u_asym8u(
+        p_out,
+        p_in,
+        (int)q_zero_point,
+        out_multiplier.const_data_ptr<int32_t>()[0],
+        out_shift.const_data_ptr<int32_t>()[0],
+        (int)out_zero_point,
+        0,
+        255,
+        input.numel());
 
     ET_CHECK_MSG(ret_val == 0, "An internal error occured");
+
   } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
     const int8_t* p_in = input.const_data_ptr<int8_t>();
     int8_t* p_out = output.mutable_data_ptr<int8_t>();
     int8_t q_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
 
-    WORD32 ret_val = xa_nn_vec_activation_min_max_8_8(
-        p_out, p_in, (int)q_zero_point, (int)128, input.numel());
+    WORD32 ret_val = xa_nn_vec_relu_asym8s_asym8s(
+        p_out,
+        p_in,
+        (int)q_zero_point,
+        out_multiplier.const_data_ptr<int32_t>()[0],
+        out_shift.const_data_ptr<int32_t>()[0],
+        (int)out_zero_point,
+        -128,
+        127,
+        input.numel());
 
     ET_CHECK_MSG(ret_val == 0, "An internal error occured");