From a8c4f667f1c8d197fceeee95e7cb184136d6ac74 Mon Sep 17 00:00:00 2001
From: dijopaul
Date: Thu, 26 Sep 2024 01:02:14 -0700
Subject: [PATCH] Fixing review comments in 5483

---
 backends/cadence/hifi/operators/op_add.cpp     | 239 ++++++++++++++----
 backends/cadence/hifi/operators/op_div.cpp     | 167 ++++++------
 backends/cadence/hifi/operators/op_mul.cpp     |  75 +++---
 .../cadence/hifi/operators/op_sigmoid.cpp      |  42 +--
 backends/cadence/hifi/operators/op_sub.cpp     |  84 +++---
 backends/cadence/hifi/operators/op_tanh.cpp    |  14 +-
 6 files changed, 380 insertions(+), 241 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index 15ba5c250e7..38585b40055 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -9,82 +9,140 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
 namespace native {
+namespace {
 
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted + alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
 
 Tensor& add_out(
-    RuntimeContext& ctx,
+    KernelRuntimeContext& ctx,
     const Tensor& a,
     const Tensor& b,
     const Scalar& alpha,
     Tensor& out) {
-  (void)ctx;
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
-  ScalarType common_type = promoteTypes(a_type, b_type);
+  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
+  ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();
 
-  ET_CHECK_MSG(a_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(b_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(out_type == ScalarType::Float, "Output tensor not a float.\n");
-
-  ET_CHECK(canCast(common_type, out_type));
-
-  using CTYPE_A = float;
-  using CTYPE_B = float;
-  using CTYPE_IN = float;
-  using CTYPE_OUT = float;
-  CTYPE_IN alpha_val;
-  ET_EXTRACT_SCALAR(alpha, alpha_val);
+  ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out);
+
+  float alpha_val;
+  utils::extract_scalar(alpha, &alpha_val);
+
+  constexpr auto name = "add.out";
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
-  if( (out_type != ScalarType::Float) || (alpha_val != 1.0))
-    fall_back = 1;
+  if((out_type != ScalarType::Float) || (alpha_val != 1.0))
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if (!fall_back)
+  if(optimized)
   {
     const float* const a_data = a.const_data_ptr<float>();
     const float* const b_data = b.const_data_ptr<float>();
     float* const out_data = out.mutable_data_ptr<float>();
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         out_shape[i] = 1;
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
 
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
@@ -97,24 +155,109 @@ Tensor& add_out(
             b_data,
             inp2_shape);
     }
     else
+    {
       xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
-
+    }
+
+    return out;
   }
-  else
-  {
-    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-          CTYPE_IN value = a_casted + alpha_val * b_casted;
-
-          return static_cast<CTYPE_OUT>(value);
-        },
-        a,
-        b,
-        out);
-  }
+
+  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+
+      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        AddInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, alpha_val, out);
+      });
+    });
+  });
 
   return out;
 }
+
+Tensor& add_scalar_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Scalar& b,
+    const Scalar& alpha,
+    Tensor& out) {
+
+  // Resize for dynamic shape
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      resize_tensor(out, a.sizes()) == Error::Ok,
+      InvalidArgument,
+      out,
+      "Failed to resize output tensor.");
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = utils::get_scalar_dtype(b);
+  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
+  ScalarType common_type =
+      utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false);
+  ScalarType out_type = out.scalar_type();
+
+  ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out);
+
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
+  if (common_type == ScalarType::Half) {
+    common_type = ScalarType::Float;
+  }
+
+  constexpr auto name = "add.Scalar_out";
+
+  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename utils::promote_type_with_scalar_type<
+          CTYPE_A,
+          CTYPE_B,
+          /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+
+      CTYPE_B b_val;
+      utils::extract_scalar(b, &b_val);
+      CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
+
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+
+      using CTYPE_OUT = typename std::conditional<
+          std::is_same<CTYPE_A, internal::F2>::value,
+          internal::F2,
+          CTYPE_IN>::type;
+
+      apply_unary_map_fn(
+          [b_casted, alpha_val](const CTYPE_A val_a) {
+            CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+            CTYPE_IN value = a_casted + alpha_val * b_casted;
+            return static_cast<CTYPE_OUT>(value);
+          },
+          a.const_data_ptr<CTYPE_A>(),
+          out.mutable_data_ptr<CTYPE_OUT>(),
+          out.numel());
+    });
+  });
+
+  return out;
+}
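Editor's note: the AddInner<can_cast, ...> machinery above selects between a real implementation and a deliberate-failure stub at compile time, so dtype combinations already rejected by the runtime canCast check never instantiate an ill-formed cast. A reduced, self-contained sketch of the same pattern (simplified names for illustration; these are not the ExecuTorch definitions):

    #include <cstdio>

    // Primary template: dispatch on a compile-time boolean.
    template <bool can_cast, typename IN, typename OUT>
    struct Inner;

    // Valid combination: do the arithmetic and cast to the output type.
    template <typename IN, typename OUT>
    struct Inner<true, IN, OUT> {
      static OUT run(IN a, IN b, IN alpha) {
        return static_cast<OUT>(a + alpha * b);
      }
    };

    // Invalid combination: instantiated only so every dtype-switch branch
    // compiles; reaching it at runtime would be a bug, which the real code
    // asserts via ET_DCHECK_MSG.
    template <typename IN, typename OUT>
    struct Inner<false, IN, OUT> {
      static OUT run(IN, IN, IN) { return OUT{}; }
    };

    int main() {
      // The ET_SWITCH macros pick the template arguments per runtime dtype;
      // here the decision is hard-coded for illustration.
      printf("%f\n", Inner<true, float, float>::run(1.0f, 2.0f, 1.0f)); // 3.0
      return 0;
    }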
diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp
index dc6a22ea4de..057132e7bc7 100644
--- a/backends/cadence/hifi/operators/op_div.cpp
+++ b/backends/cadence/hifi/operators/op_div.cpp
@@ -13,14 +13,11 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
 namespace native {
-
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
 
 namespace {
@@ -61,25 +58,26 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -88,20 +86,20 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
        out_shape[i] = 1;
        inp1_shape[i] = 1;
        inp2_shape[i] = 1;
      }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
       for(int i = 0; i < a.dim(); i++)
@@ -116,34 +114,34 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
       xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
+
+    return out;
   }
-  else
-  {
-    ScalarType common_type = get_compute_type(a_type, b_type);
-    ScalarType out_type = out.scalar_type();
 
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+  ScalarType common_type = get_compute_type(a_type, b_type);
+  ScalarType out_type = out.scalar_type();
 
-    ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
-      ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
-        ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
-          ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted / b_casted;
 
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+  ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+
+  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
+    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
+      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
+        ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
+          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+              [](const CTYPE_A val_a, const CTYPE_B val_b) {
+                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+                CTYPE_IN value = a_casted / b_casted;
+
+                return static_cast<CTYPE_OUT>(value);
+              },
+              a,
+              b,
+              out);
         });
       });
     });
-  }
+  });
 
   return out;
 }
@@ -174,33 +172,33 @@ Tensor& div_out_mode(
       !(common_type != ScalarType::Bool && out_type == ScalarType::Bool),
       InvalidArgument,
       out);
-
+
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
   int mode_val = -1;
   if (mode.has_value() && mode.value() == "trunc")
     mode_val = 0;
   else if (mode.has_value() && mode.value() == "floor")
     mode_val = 1;
   else
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -208,20 +206,20 @@ Tensor& div_out_mode(
 
     if(broadcast)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
         out_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
 
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
@@ -236,33 +234,33 @@ Tensor& div_out_mode(
     {
       xa_nn_elm_div_mode_f32xf32_f32(out_data, a_data, b_data, out.numel(), mode_val);
     }
+
+    return out;
   }
-  else
-  {
-    ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
-      ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
-        ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
-          ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted / b_casted;
-                  if (mode.has_value() && mode.value() == "trunc") {
-                    value = std::trunc(value);
-                  } else if (mode.has_value() && mode.value() == "floor") {
-                    value = std::floor(value);
-                  }
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+
+  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
+    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
+      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
+        ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
+          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+              [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
+                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+                CTYPE_IN value = a_casted / b_casted;
+                if (mode.has_value() && mode.value() == "trunc") {
+                  value = std::trunc(value);
+                } else if (mode.has_value() && mode.value() == "floor") {
+                  value = std::floor(value);
+                }
+                return static_cast<CTYPE_OUT>(value);
+              },
+              a,
+              b,
+              out);
        });
       });
     });
-  }
+  });
 
   return out;
 }
@@ -318,7 +316,6 @@ Tensor& div_scalar_mode_out(
     const Scalar& b,
     exec_aten::optional<exec_aten::string_view> mode,
     Tensor& out) {
-  (void)ctx;
 
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
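Editor's note: the mode_val encoding above (0 for "trunc", 1 for "floor", fallback otherwise) only changes results for negative quotients, where rounding toward zero and rounding toward negative infinity disagree. A standalone illustration using the same std::trunc/std::floor calls as the portable path (independent of the xa_nn_elm_div_mode_f32xf32_f32 kernel):

    #include <cmath>
    #include <cstdio>

    int main() {
      float q = -7.0f / 2.0f;                  // -3.5
      printf("trunc: %.1f\n", std::trunc(q));  // -3.0, rounds toward zero
      printf("floor: %.1f\n", std::floor(q));  // -4.0, rounds toward -infinity
      return 0;
    }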
diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp
index c430bfa0740..05173e831c6 100644
--- a/backends/cadence/hifi/operators/op_mul.cpp
+++ b/backends/cadence/hifi/operators/op_mul.cpp
@@ -11,9 +11,8 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
 
 namespace torch {
 namespace executor {
@@ -79,27 +78,28 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
   ScalarType b_type = b.scalar_type();
   ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
   if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -107,18 +107,18 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
        out_shape[i] = 1;
        inp1_shape[i] = 1;
        inp2_shape[i] = 1;
      }
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
       for(int i = 0; i < out.dim(); i++){
         out_shape[i+off_o] = out.size(i);}
       for(int i = 0; i < a.dim(); i++)
@@ -132,26 +132,26 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
     {
       xa_nn_elm_mul_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
-  }
-  else
-  {
-    ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
-      ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
-          MulInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, out);
-        });
-      });
-    });
+
+    return out;
   }
 
+  ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
+    ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
+        MulInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, out);
+      });
+    });
+  });
+
   return out;
 }
 
@@ -160,7 +160,6 @@ Tensor& mul_scalar_out(
     const Tensor& a,
     const Scalar& b,
     Tensor& out) {
-  (void)ctx;
 
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
@@ -180,6 +179,8 @@ Tensor& mul_scalar_out(
 
   ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
 
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
   if (common_type == ScalarType::Half) {
     common_type = ScalarType::Float;
   }
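Editor's note: as in op_add and op_div above, the broadcast branch right-aligns each tensor's sizes into a fixed kNnlibMaxDim-wide array, padding the leading dimensions with 1, before handing the shapes to the 4D NNLIB kernel. A standalone sketch of just that alignment step (pad_shape is an illustrative helper, not part of this patch):

    #include <cstdio>
    #include <vector>

    constexpr int kNnlibMaxDim = 4;

    // Right-align `sizes` into a kNnlibMaxDim-wide shape array, filling the
    // leading (missing) dimensions with 1 -- the same loop the operators run
    // over out_shape/inp1_shape/inp2_shape.
    void pad_shape(const std::vector<int>& sizes, int shape[kNnlibMaxDim]) {
      for (int i = 0; i < kNnlibMaxDim; i++)
        shape[i] = 1;
      int off = kNnlibMaxDim - static_cast<int>(sizes.size());
      for (int i = 0; i < static_cast<int>(sizes.size()); i++)
        shape[i + off] = sizes[i];
    }

    int main() {
      int shape[kNnlibMaxDim];
      pad_shape({8, 3}, shape); // a 2-D input broadcast against a 4-D output
      printf("%d %d %d %d\n", shape[0], shape[1], shape[2], shape[3]); // 1 1 8 3
      return 0;
    }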
diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp
index 0b7a72bcc73..6c54e053f9c 100644
--- a/backends/cadence/hifi/operators/op_sigmoid.cpp
+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp
@@ -10,7 +10,7 @@
 #include <cmath>
 
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -36,33 +36,33 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
   ScalarType in_type = in.scalar_type();
   ScalarType out_type = out.scalar_type();
-  int fall_back = 0;
+  bool optimized = true;
 
   if((in_type != ScalarType::Float) || (out_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* data_in = in.mutable_data_ptr<float>();
     float* data_out = out.mutable_data_ptr<float>();
     xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel());
+
+    return out;
   }
-  else
-  {
-    ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
-      ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
-        apply_unary_map_fn(
-            [](const CTYPE_IN val_in) {
-              // perform math in double to preserve precision
-              double in_casted = static_cast<double>(val_in);
-              double out_val = 1.0 / (1.0 + exp(-in_casted));
-              return static_cast<CTYPE_OUT>(out_val);
-            },
-            in.const_data_ptr<CTYPE_IN>(),
-            out.mutable_data_ptr<CTYPE_OUT>(),
-            in.numel());
-      });
-    });
-  }
+
+  ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
+    ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
+      apply_unary_map_fn(
+          [](const CTYPE_IN val_in) {
+            // perform math in double to preserve precision
+            double in_casted = static_cast<double>(val_in);
+            double out_val = 1.0 / (1.0 + exp(-in_casted));
+            return static_cast<CTYPE_OUT>(out_val);
+          },
+          in.const_data_ptr<CTYPE_IN>(),
+          out.mutable_data_ptr<CTYPE_OUT>(),
+          in.numel());
+    });
+  });
 
   return out;
 }
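Editor's note: the portable fallback above deliberately widens to double before evaluating the logistic function and narrows only on the return, so the intermediate exp stays well conditioned for low-precision output types. A minimal standalone mirror of that lambda (assuming only the C++ standard library):

    #include <cmath>
    #include <cstdio>

    // Same computation as the sigmoid_out fallback lambda: widen to double,
    // evaluate 1 / (1 + e^-x), then narrow to the output type.
    template <typename OUT>
    OUT sigmoid_ref(float val_in) {
      double in_casted = static_cast<double>(val_in);
      double out_val = 1.0 / (1.0 + std::exp(-in_casted));
      return static_cast<OUT>(out_val);
    }

    int main() {
      printf("%f\n", sigmoid_ref<float>(0.0f)); // 0.500000
      printf("%f\n", sigmoid_ref<float>(4.0f)); // ~0.982014
      return 0;
    }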
diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp
index a98bb7c0da2..d297bc0c699 100644
--- a/backends/cadence/hifi/operators/op_sub.cpp
+++ b/backends/cadence/hifi/operators/op_sub.cpp
@@ -12,9 +12,7 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
-#include "kernels.h"
-
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -97,27 +95,28 @@ Tensor& sub_out(
   utils::extract_scalar(alpha, &alpha_val);
 
   constexpr auto name = "sub.out";
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
-  if( (out_type != ScalarType::Float) || (alpha_val != 1.0))
-    fall_back = 1;
+  if((out_type != ScalarType::Float) || (alpha_val != 1.0))
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     /*logic to find broadcast*/
     const int a_is_broadcasted = !out.sizes().equals(a.sizes());
@@ -129,20 +128,20 @@ Tensor& sub_out(
     float* const out_data = out.mutable_data_ptr<float>();
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         out_shape[i] = 1;
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out_dim;
-      int off_a = NNLIB_MAX_DIM - a_dim;
-      int off_b = NNLIB_MAX_DIM - b_dim;
+      int off_o = kNnlibMaxDim - out_dim;
+      int off_a = kNnlibMaxDim - a_dim;
+      int off_b = kNnlibMaxDim - b_dim;
       for(int i = 0; i < out_dim; i++)
         out_shape[i+off_o] = out.size(i);
       for(int i = 0; i < a_dim; i++)
@@ -156,29 +155,28 @@ Tensor& sub_out(
     {
       xa_nn_elm_sub_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
-
+
+    return out;
   }
-  else
-  {
 
-    ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-      ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        CTYPE_IN alpha_val;
-        utils::extract_scalar(alpha, &alpha_val);
-        ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
-          SubInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
-    });
-  }
+  ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+      ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        SubInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, alpha_val, out);
+      });
+    });
+  });
 
   return out;
 }
@@ -211,6 +209,8 @@ Tensor& sub_scalar_out(
   ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, canCast(alpha_type, common_type), InvalidArgument, out);
 
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
   if (common_type == ScalarType::Half) {
     common_type = ScalarType::Float;
   }
diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp
index 47a057fe774..f06b57a3688 100644
--- a/backends/cadence/hifi/operators/op_tanh.cpp
+++ b/backends/cadence/hifi/operators/op_tanh.cpp
@@ -9,7 +9,7 @@
 #include <cmath>
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -17,21 +17,19 @@ namespace native {
 
 Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
-  int fall_back = 0;
+  bool optimized = true;
 
   if((in.scalar_type() != ScalarType::Float) || (out.scalar_type() != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* data_in = in.mutable_data_ptr<float>();
     float* data_out = out.mutable_data_ptr<float>();
     xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel());
     return out;
   }
-  else
-  {
-    return internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out);
-  }
+
+  return internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out);
 }
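Editor's note: taken together, every operator in this patch now follows the same shape: compute an `optimized` gate from dtypes, rank, and broadcast depth, return early from the NNLIB fast path, and otherwise fall through to the portable ET_SWITCH implementation at function scope instead of inside an `else` block. A condensed, runnable skeleton of that control flow (the predicates and kernel calls below are stand-ins for the real checks and xa_nn_* entry points, not ExecuTorch APIs):

    #include <cstdio>

    // Stand-ins for the real dtype/rank/broadcast checks and kernels.
    static bool float_dtypes() { return true; }
    static bool nonzero_rank() { return true; }
    static bool broadcast_rank_ok() { return true; }
    static void nnlib_fast_path() { puts("fast path"); }
    static void portable_fallback() { puts("fallback"); }

    void op_out() {
      bool optimized = true;
      if (!float_dtypes())
        optimized = false;
      if (!nonzero_rank())
        optimized = false;
      if (!broadcast_rank_ok())
        optimized = false;

      if (optimized) {
        nnlib_fast_path(); // an xa_nn_* kernel in the real operators
        return;            // early return replaces the old if/else nesting
      }

      portable_fallback(); // the ET_SWITCH reference path
    }

    int main() {
      op_out();
      return 0;
    }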