dijopaul · dijopaul · Oct 10, 2024 · Oct 10, 2024
diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
@@ -106,6 +106,11 @@
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::where_out
+
+# Register the HiFi implementation as the kernel for the aten mean.out op.
+# NOTE(review): op_mean.cpp defines mean_dim_out inside impl::HiFi::native,
+# while the name below has no `native` component — presumably the kernel
+# codegen inserts it; confirm against the other entries in this file.
+- op: mean.out
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::HiFi::mean_dim_out   
 
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)

diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,8 +13,12 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
 )
 
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
 target_include_directories(
   cadence_kernels
   PUBLIC
@@ -23,6 +27,7 @@ target_include_directories(
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/
+	${_common_include_directories}
 )
 
 target_link_libraries(cadence_kernels PRIVATE xa_nnlib)
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
@@ -51,6 +51,16 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__
                                 const WORD32 *const p_inp1_shape,
                                 const FLOAT32 * __restrict__ p_inp2,
                                 const WORD32 *const p_inp2_shape); 
+
+// C-linkage declaration of the nnlib float32 reduce-mean kernel so that the
+// C++ operators can call it.
+// NOTE(review): presumably implemented in xa_nn_reduce_32_32.c, which this
+// change adds to the cadence_kernels sources — confirm.
+//
+// Parameter meanings below are inferred from the names and from the call in
+// op_mean.cpp; verify against the nnlib documentation:
+//   p_out / p_out_shape   output buffer and its shape (num_out_dims entries)
+//   p_inp / p_inp_shape   input buffer and its shape (num_inp_dims entries)
+//   p_axis                axes to reduce (num_axis_dims entries)
+//   p_scratch_in          caller-provided scratch memory; op_mean.cpp sizes it
+//                         with xa_nn_reduce_getsize_nhwc
+// Returns a WORD32 status code — TODO confirm the success value.
+extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(FLOAT32 * __restrict__ p_out,
+                                const WORD32 *const p_out_shape,
+                                const FLOAT32 * __restrict__ p_inp,
+                                const WORD32 *const p_inp_shape,
+                                const WORD32 * __restrict__ p_axis,
+                                WORD32 num_out_dims,
+                                WORD32 num_inp_dims,
+                                WORD32 num_axis_dims,
+                                void * __restrict__ p_scratch_in);                                
 
 namespace impl {
 namespace HiFi {

diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -26,6 +26,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
@@ -47,6 +48,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
     )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)

diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
+#include <executorch/kernels/portable/cpu/util/reduce_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+
+using exec_aten::Tensor;
+using exec_aten::ScalarType;
+using executorch::aten::RuntimeContext;
+using torch::executor::Error;
+using executorch::runtime::ArrayRef;
+using torch::executor::optional;
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+/**
+ * Converts the tensor shapes and reduction dims of a mean call into the
+ * plain-int arrays that the nnlib reduce kernel takes.
+ *
+ * @param in            Input tensor; its first num_inp_dims sizes are copied.
+ * @param out           Output tensor; its first num_out_dims sizes are copied.
+ * @param dim_list      Dims to reduce. Negative entries are wrapped by adding
+ *                      num_inp_dims. When the list is absent or empty, every
+ *                      input dim is reduced (full reduction).
+ * @param inp_shape     [out] Receives num_inp_dims input sizes.
+ * @param out_shape     [out] Receives num_out_dims output sizes.
+ * @param p_axis        [out] Receives the non-negative reduction axes.
+ * @param num_inp_dims  Rank of the input as seen by the caller.
+ * @param num_out_dims  Rank of the output as seen by the caller.
+ * @return Number of axes written to p_axis.
+ *
+ * No bounds checking is done on the output arrays; the caller must size them.
+ * NOTE(review): assumes dim_list was already validated (unique, in-range
+ * entries) so that at most num_inp_dims axes are written — confirm that the
+ * caller's argument check guarantees this.
+ */
+int prepare_data(
+    const Tensor& in,
+    Tensor& out,
+    optional<ArrayRef<int64_t>> dim_list,
+    int *inp_shape,
+    int *out_shape,
+    int *p_axis,
+    int num_inp_dims,
+    int num_out_dims
+    ) {
+
+  for (int i = 0; i < num_inp_dims; ++i) {
+    inp_shape[i] = static_cast<int>(in.size(i));
+  }
+
+  for (int i = 0; i < num_out_dims; ++i) {
+    out_shape[i] = static_cast<int>(out.size(i));
+  }
+
+  int num_axis_dims = 0;
+
+  // A missing dim list must not be dereferenced. Treat it, and an empty list,
+  // as a reduction over every input dim.
+  // NOTE(review): this is intended to match how the portable reduction
+  // helpers interpret a missing/empty dim list — confirm.
+  if (!dim_list.has_value() || dim_list.value().size() == 0) {
+    for (int axis = 0; axis < num_inp_dims; ++axis) {
+      p_axis[num_axis_dims++] = axis;
+    }
+    return num_axis_dims;
+  }
+
+  for (const auto& d : dim_list.value()) {
+    // Wrap negative dims so the kernel only ever sees non-negative axes.
+    p_axis[num_axis_dims++] = static_cast<int>(d < 0 ? d + num_inp_dims : d);
+  }
+
+  return num_axis_dims;
+}
+
+/**
+ * HiFi implementation of mean.out: writes the mean of `in` over `dim_list`
+ * into `out`.
+ *
+ * The arguments are validated and `out` is resized first; on failure the
+ * kernel check reports InvalidArgument and `out` is returned unchanged.
+ *
+ * A float32 nnlib kernel is used when all of the following hold:
+ *   - both `in` and `out` are Float (the fast path reads and writes raw
+ *     float buffers, so neither side may be another dtype),
+ *   - `in` has between 1 and kNnlibMaxDim dims and at least one element,
+ *   - `dim_list` is present and non-empty.
+ * In every other case, or if the nnlib path cannot complete (scratch size
+ * error, allocation failure, non-zero kernel status), the generic
+ * type-switched reduction loop below computes the result instead.
+ *
+ * @param ctx       Kernel runtime context used for error reporting.
+ * @param in        Input tensor.
+ * @param dim_list  Dims to reduce over.
+ * @param keepdim   Whether reduced dims are kept in the output shape.
+ * @param dtype     Optional output dtype, checked by check_mean_dim_args.
+ * @param out       Output tensor; resized and written in place.
+ * @return `out`.
+ */
+Tensor& mean_dim_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    optional<ArrayRef<int64_t>> dim_list,
+    bool keepdim,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+
+  ET_KERNEL_CHECK(
+      ctx,
+      check_mean_dim_args(in, dim_list, keepdim, dtype, out),
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  constexpr auto name = "mean.out";
+  constexpr int kNnlibMaxDim = 4;
+
+  // The nnlib kernel works on raw float buffers described by at most
+  // kNnlibMaxDim dims and an explicit axis list. Anything outside that
+  // envelope is left to the generic loop.
+  const bool optimized = in.scalar_type() == ScalarType::Float &&
+      out.scalar_type() == ScalarType::Float && in.dim() >= 1 &&
+      in.dim() <= kNnlibMaxDim && in.numel() > 0 && dim_list.has_value() &&
+      dim_list.value().size() > 0;
+
+  if (optimized) {
+    float* __restrict__ p_out = out.mutable_data_ptr<float>();
+    const float* __restrict__ p_inp = in.const_data_ptr<float>();
+
+    const int num_inp_dims = static_cast<int>(in.dim());
+    int num_out_dims = static_cast<int>(out.dim());
+
+    int inp_shape[kNnlibMaxDim];
+    int out_shape[kNnlibMaxDim];
+    int p_axis[kNnlibMaxDim];
+
+    // Pre-fill so entries beyond the actual rank hold a defined value.
+    for (int i = 0; i < kNnlibMaxDim; ++i) {
+      out_shape[i] = 1;
+      inp_shape[i] = 1;
+      p_axis[i] = 1;
+    }
+
+    const int num_axis_dims = prepare_data(
+        in, out, dim_list, inp_shape, out_shape, p_axis, num_inp_dims, num_out_dims);
+
+    // Reducing over every dim: describe the output to nnlib as a single
+    // one-element dim regardless of keepdim.
+    if (num_axis_dims == num_inp_dims) {
+      num_out_dims = 1;
+      out_shape[0] = 1;
+    }
+
+    // NOTE(review): the meaning of the -3 and 1 selector arguments is not
+    // visible in this file — confirm against the nnlib documentation.
+    const int scratch_size = xa_nn_reduce_getsize_nhwc(-3,
+                                                       inp_shape,
+                                                       num_inp_dims,
+                                                       p_axis,
+                                                       num_axis_dims,
+                                                       1);
+
+    if (scratch_size >= 0) {
+      void* p_scratch_in = scratch_size > 0 ? malloc(scratch_size) : nullptr;
+
+      if (scratch_size == 0 || p_scratch_in != nullptr) {
+        const int status = xa_nn_reduce_mean_4D_f32_f32(p_out,
+                                                        out_shape,
+                                                        p_inp,
+                                                        inp_shape,
+                                                        p_axis,
+                                                        num_out_dims,
+                                                        num_inp_dims,
+                                                        num_axis_dims,
+                                                        p_scratch_in);
+
+        // The scratch buffer is only needed during the call; release it on
+        // every path so repeated invocations do not leak.
+        free(p_scratch_in);
+
+        // NOTE(review): assumes 0 means success. Any other status falls
+        // through to the generic loop, which rewrites every output element.
+        if (status == 0) {
+          return out;
+        }
+      }
+    }
+  }
+
+  // Generic path: accumulate in the output type, then divide by the number
+  // of reduced elements.
+  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
+    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const size_t num = get_reduced_dim_product(in, dim_list);
+
+      for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
+        CTYPE_OUT sum = 0;
+        if (in.numel() > 0) {
+          sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
+              [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+              [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+              in,
+              dim_list,
+              out_ix);
+        }
+        out_data[out_ix] = sum / static_cast<float>(num);
+      }
+    });
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl