From a8c4f667f1c8d197fceeee95e7cb184136d6ac74 Mon Sep 17 00:00:00 2001
From: dijopaul
Date: Thu, 26 Sep 2024 01:02:14 -0700
Subject: [PATCH] Fixing review comments in 5483

---
 backends/cadence/hifi/operators/op_add.cpp     | 239 ++++++++++++++----
 backends/cadence/hifi/operators/op_div.cpp     | 167 ++++++------
 backends/cadence/hifi/operators/op_mul.cpp     |  75 +++---
 .../cadence/hifi/operators/op_sigmoid.cpp      |  42 +--
 backends/cadence/hifi/operators/op_sub.cpp     |  84 +++---
 backends/cadence/hifi/operators/op_tanh.cpp    |  14 +-
 6 files changed, 380 insertions(+), 241 deletions(-)

diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index 15ba5c250e7..38585b40055 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -9,82 +9,140 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
 namespace native {
+namespace {
 
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted + alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
 
 Tensor& add_out(
-    RuntimeContext& ctx,
+    KernelRuntimeContext& ctx,
     const Tensor& a,
     const Tensor& b,
     const Scalar& alpha,
     Tensor& out) {
-  (void)ctx;
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out);
 
   ScalarType a_type = a.scalar_type();
   ScalarType b_type = b.scalar_type();
-  ScalarType common_type = promoteTypes(a_type, b_type);
+  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
+  ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();
 
-  ET_CHECK_MSG(a_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(b_type == ScalarType::Float, "Input tensor not a float.\n");
-  ET_CHECK_MSG(out_type == ScalarType::Float, "Output tensor not a float.\n");
-
-  ET_CHECK(canCast(common_type, out_type));
-
-  using CTYPE_A = float;
-  using CTYPE_B = float;
-  using CTYPE_IN = float;
-  using CTYPE_OUT = float;
-  CTYPE_IN alpha_val;
-  ET_EXTRACT_SCALAR(alpha, alpha_val);
+  ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out);
+
+  float alpha_val;
+  utils::extract_scalar(alpha, &alpha_val);
+
+  constexpr auto name = "add.out";
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
-  if( (out_type != ScalarType::Float) || (alpha_val != 1.0))
-    fall_back = 1;
+  if((out_type != ScalarType::Float) || (alpha_val != 1.0))
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if (!fall_back)
+  if(optimized)
   {
     const float* const a_data = a.const_data_ptr<float>();
     const float* const b_data = b.const_data_ptr<float>();
     float* const out_data = out.mutable_data_ptr<float>();
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         out_shape[i] = 1;
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
 
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
@@ -97,24 +155,109 @@ Tensor& add_out(
             b_data,
             inp2_shape);
     }
     else
+    {
       xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
-
+    }
+
+    return out;
   }
-  else
-  {
-    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
-          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-          CTYPE_IN value = a_casted + alpha_val * b_casted;
-
-          return static_cast<CTYPE_OUT>(value);
-        },
-        a,
-        b,
-        out);
-  }
+
+  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+
+      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        AddInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, alpha_val, out);
+      });
+    });
+  });
 
   return out;
 }
+
+Tensor& add_scalar_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Scalar& b,
+    const Scalar& alpha,
+    Tensor& out) {
+
+  // Resize for dynamic shape
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      resize_tensor(out, a.sizes()) == Error::Ok,
+      InvalidArgument,
+      out,
+      "Failed to resize output tensor.");
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out);
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = utils::get_scalar_dtype(b);
+  ScalarType alpha_type = utils::get_scalar_dtype(alpha);
+  ScalarType common_type =
+      utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false);
+  ScalarType out_type = out.scalar_type();
+
+  ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx, check_alpha_type(alpha_type, common_type), InvalidArgument, out);
+
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
+  if (common_type == ScalarType::Half) {
+    common_type = ScalarType::Float;
+  }
+
+  constexpr auto name = "add.Scalar_out";
+
+  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename utils::promote_type_with_scalar_type<
+          CTYPE_A,
+          CTYPE_B,
+          /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+
+      CTYPE_B b_val;
+      utils::extract_scalar(b, &b_val);
+      CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
+
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+
+      using CTYPE_OUT = typename std::conditional<
+          std::is_same<CTYPE_A, internal::F2>::value,
+          internal::F2,
+          CTYPE_IN>::type;
+
+      apply_unary_map_fn(
+          [b_casted, alpha_val](const CTYPE_A val_a) {
+            CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+            CTYPE_IN value = a_casted + alpha_val * b_casted;
+            return static_cast<CTYPE_OUT>(value);
+          },
+          a.const_data_ptr<CTYPE_A>(),
+          out.mutable_data_ptr<CTYPE_OUT>(),
+          out.numel());
+    });
+  });
+
+  return out;
+}
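Editor's note: the AddInner<can_cast, ...> machinery above selects between a real implementation and a deliberate-failure stub at compile time, so dtype combinations already rejected by the runtime canCast check never instantiate an ill-formed cast. A reduced, self-contained sketch of the same pattern (simplified names for illustration; these are not the ExecuTorch definitions):

    #include <cstdio>

    // Primary template: dispatch on a compile-time boolean.
    template <bool can_cast, typename IN, typename OUT>
    struct Inner;

    // Valid combination: do the arithmetic and cast to the output type.
    template <typename IN, typename OUT>
    struct Inner<true, IN, OUT> {
      static OUT run(IN a, IN b, IN alpha) {
        return static_cast<OUT>(a + alpha * b);
      }
    };

    // Invalid combination: instantiated only so every dtype-switch branch
    // compiles; reaching it at runtime would be a bug, which the real code
    // asserts via ET_DCHECK_MSG.
    template <typename IN, typename OUT>
    struct Inner<false, IN, OUT> {
      static OUT run(IN, IN, IN) { return OUT{}; }
    };

    int main() {
      // The ET_SWITCH macros pick the template arguments per runtime dtype;
      // here the decision is hard-coded for illustration.
      printf("%f\n", Inner<true, float, float>::run(1.0f, 2.0f, 1.0f)); // 3.0
      return 0;
    }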
diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp
index dc6a22ea4de..057132e7bc7 100644
--- a/backends/cadence/hifi/operators/op_div.cpp
+++ b/backends/cadence/hifi/operators/op_div.cpp
@@ -13,14 +13,11 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
 namespace native {
-
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
 
 namespace {
@@ -61,25 +58,26 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
   ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out);
 
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -88,20 +86,20 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
        out_shape[i] = 1;
        inp1_shape[i] = 1;
        inp2_shape[i] = 1;
      }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
       for(int i = 0; i < a.dim(); i++)
@@ -116,34 +114,34 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
       xa_nn_elm_div_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
+
+    return out;
   }
-  else
-  {
-    ScalarType common_type = get_compute_type(a_type, b_type);
-    ScalarType out_type = out.scalar_type();
 
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+  ScalarType common_type = get_compute_type(a_type, b_type);
+  ScalarType out_type = out.scalar_type();
 
-    ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
-      ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
-        ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
-          ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted / b_casted;
 
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+  ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
+
+  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
+    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
+      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
+        ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
+          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+              [](const CTYPE_A val_a, const CTYPE_B val_b) {
+                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+                CTYPE_IN value = a_casted / b_casted;
+
+                return static_cast<CTYPE_OUT>(value);
+              },
+              a,
+              b,
+              out);
         });
       });
     });
-  }
+  });
 
   return out;
 }
@@ -174,33 +172,33 @@ Tensor& div_out_mode(
       !(common_type != ScalarType::Bool && out_type == ScalarType::Bool),
       InvalidArgument,
       out);
-
+
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
   int mode_val = -1;
   if (mode.has_value() && mode.value() == "trunc")
     mode_val = 0;
   else if (mode.has_value() && mode.value() == "floor")
     mode_val = 1;
   else
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -208,20 +206,20 @@ Tensor& div_out_mode(
 
     if(broadcast)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
         out_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
 
       for(int i = 0; i < out.dim(); i++)
         out_shape[i+off_o] = out.size(i);
@@ -236,33 +234,33 @@ Tensor& div_out_mode(
     {
       xa_nn_elm_div_mode_f32xf32_f32(out_data, a_data, b_data, out.numel(), mode_val);
     }
+
+    return out;
   }
-  else
-  {
-    ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
-      ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
-        ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
-          ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
-            apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
-                  CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                  CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                  CTYPE_IN value = a_casted / b_casted;
-                  if (mode.has_value() && mode.value() == "trunc") {
-                    value = std::trunc(value);
-                  } else if (mode.has_value() && mode.value() == "floor") {
-                    value = std::floor(value);
-                  }
-                  return static_cast<CTYPE_OUT>(value);
-                },
-                a,
-                b,
-                out);
-          });
+
+  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
+    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
+      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
+        ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
+          apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+              [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
+                CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+                CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+                CTYPE_IN value = a_casted / b_casted;
+                if (mode.has_value() && mode.value() == "trunc") {
+                  value = std::trunc(value);
+                } else if (mode.has_value() && mode.value() == "floor") {
+                  value = std::floor(value);
+                }
+                return static_cast<CTYPE_OUT>(value);
+              },
+              a,
+              b,
+              out);
        });
       });
     });
-  }
+  });
 
   return out;
 }
@@ -318,7 +316,6 @@ Tensor& div_scalar_mode_out(
     const Scalar& b,
     exec_aten::optional<exec_aten::string_view> mode,
     Tensor& out) {
-  (void)ctx;
 
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
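Editor's note: the mode_val encoding above (0 for "trunc", 1 for "floor", fallback otherwise) only changes results for negative quotients, where rounding toward zero and rounding toward negative infinity disagree. A standalone illustration using the same std::trunc/std::floor calls as the portable path (independent of the xa_nn_elm_div_mode_f32xf32_f32 kernel):

    #include <cmath>
    #include <cstdio>

    int main() {
      float q = -7.0f / 2.0f;                  // -3.5
      printf("trunc: %.1f\n", std::trunc(q));  // -3.0, rounds toward zero
      printf("floor: %.1f\n", std::floor(q));  // -4.0, rounds toward -infinity
      return 0;
    }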
diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp
index c430bfa0740..05173e831c6 100644
--- a/backends/cadence/hifi/operators/op_mul.cpp
+++ b/backends/cadence/hifi/operators/op_mul.cpp
@@ -11,9 +11,8 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
 
 namespace torch {
 namespace executor {
@@ -79,27 +78,28 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
   ScalarType b_type = b.scalar_type();
   ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true);
   ScalarType out_type = out.scalar_type();
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   if((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
   if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* a_data = a.mutable_data_ptr<float>();
     float* b_data = b.mutable_data_ptr<float>();
@@ -107,18 +107,18 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
        out_shape[i] = 1;
        inp1_shape[i] = 1;
        inp2_shape[i] = 1;
      }
-      int off_o = NNLIB_MAX_DIM - out.dim();
-      int off_a = NNLIB_MAX_DIM - a.dim();
-      int off_b = NNLIB_MAX_DIM - b.dim();
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
       for(int i = 0; i < out.dim(); i++){
         out_shape[i+off_o] = out.size(i);}
       for(int i = 0; i < a.dim(); i++)
@@ -132,26 +132,26 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
     {
       xa_nn_elm_mul_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
-  }
-  else
-  {
-    ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
-      ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
-          MulInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, out);
-        });
-      });
-    });
+
+    return out;
   }
 
+  ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
+    ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
+        MulInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, out);
+      });
+    });
+  });
+
   return out;
 }
 
@@ -160,7 +160,6 @@ Tensor& mul_scalar_out(
     const Tensor& a,
     const Scalar& b,
     Tensor& out) {
-  (void)ctx;
 
   // Resize for dynamic shape
   ET_KERNEL_CHECK_MSG(
@@ -180,6 +179,8 @@ Tensor& mul_scalar_out(
 
   ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
 
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
   if (common_type == ScalarType::Half) {
     common_type = ScalarType::Float;
   }
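Editor's note: as in op_add and op_div above, the broadcast branch right-aligns each tensor's sizes into a fixed kNnlibMaxDim-wide array, padding the leading dimensions with 1, before handing the shapes to the 4D NNLIB kernel. A standalone sketch of just that alignment step (pad_shape is an illustrative helper, not part of this patch):

    #include <cstdio>
    #include <vector>

    constexpr int kNnlibMaxDim = 4;

    // Right-align `sizes` into a kNnlibMaxDim-wide shape array, filling the
    // leading (missing) dimensions with 1 -- the same loop the operators run
    // over out_shape/inp1_shape/inp2_shape.
    void pad_shape(const std::vector<int>& sizes, int shape[kNnlibMaxDim]) {
      for (int i = 0; i < kNnlibMaxDim; i++)
        shape[i] = 1;
      int off = kNnlibMaxDim - static_cast<int>(sizes.size());
      for (int i = 0; i < static_cast<int>(sizes.size()); i++)
        shape[i + off] = sizes[i];
    }

    int main() {
      int shape[kNnlibMaxDim];
      pad_shape({8, 3}, shape); // a 2-D input broadcast against a 4-D output
      printf("%d %d %d %d\n", shape[0], shape[1], shape[2], shape[3]); // 1 1 8 3
      return 0;
    }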
diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp
index 0b7a72bcc73..6c54e053f9c 100644
--- a/backends/cadence/hifi/operators/op_sigmoid.cpp
+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp
@@ -10,7 +10,7 @@
 #include <cmath>
 
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -36,33 +36,33 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
   ScalarType in_type = in.scalar_type();
   ScalarType out_type = out.scalar_type();
-  int fall_back = 0;
+  bool optimized = true;
 
   if((in_type != ScalarType::Float) || (out_type != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* data_in = in.mutable_data_ptr<float>();
     float* data_out = out.mutable_data_ptr<float>();
     xa_nn_vec_sigmoid_f32_f32(data_out, data_in, in.numel());
+
+    return out;
   }
-  else
-  {
-    ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
-      ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
-        apply_unary_map_fn(
-            [](const CTYPE_IN val_in) {
-              // perform math in double to preserve precision
-              double in_casted = static_cast<double>(val_in);
-              double out_val = 1.0 / (1.0 + exp(-in_casted));
-              return static_cast<CTYPE_OUT>(out_val);
-            },
-            in.const_data_ptr<CTYPE_IN>(),
-            out.mutable_data_ptr<CTYPE_OUT>(),
-            in.numel());
-      });
-    });
-  }
+
+  ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
+    ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
+      apply_unary_map_fn(
+          [](const CTYPE_IN val_in) {
+            // perform math in double to preserve precision
+            double in_casted = static_cast<double>(val_in);
+            double out_val = 1.0 / (1.0 + exp(-in_casted));
+            return static_cast<CTYPE_OUT>(out_val);
+          },
+          in.const_data_ptr<CTYPE_IN>(),
+          out.mutable_data_ptr<CTYPE_OUT>(),
+          in.numel());
+    });
+  });
 
   return out;
 }
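Editor's note: the portable fallback above deliberately widens to double before evaluating the logistic function and narrows only on the return, so the intermediate exp stays well conditioned for low-precision output types. A minimal standalone mirror of that lambda (assuming only the C++ standard library):

    #include <cmath>
    #include <cstdio>

    // Same computation as the sigmoid_out fallback lambda: widen to double,
    // evaluate 1 / (1 + e^-x), then narrow to the output type.
    template <typename OUT>
    OUT sigmoid_ref(float val_in) {
      double in_casted = static_cast<double>(val_in);
      double out_val = 1.0 / (1.0 + std::exp(-in_casted));
      return static_cast<OUT>(out_val);
    }

    int main() {
      printf("%f\n", sigmoid_ref<float>(0.0f)); // 0.500000
      printf("%f\n", sigmoid_ref<float>(4.0f)); // ~0.982014
      return 0;
    }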
diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp
index a98bb7c0da2..d297bc0c699 100644
--- a/backends/cadence/hifi/operators/op_sub.cpp
+++ b/backends/cadence/hifi/operators/op_sub.cpp
@@ -12,9 +12,7 @@
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
-#include "kernels.h"
-
-#define NNLIB_MAX_DIM 4 /* Add fallback if broadcast and dim > 4 */
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -97,27 +95,28 @@ Tensor& sub_out(
   utils::extract_scalar(alpha, &alpha_val);
 
   constexpr auto name = "sub.out";
+  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
-  int fall_back = 0;
+  bool optimized = true;
   /*find broadcast*/
-  const int a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const int b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const int broadcast = (a_is_broadcasted || b_is_broadcasted);
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
-  if( (out_type != ScalarType::Float) || (alpha_val != 1.0))
-    fall_back = 1;
+  if((out_type != ScalarType::Float) || (alpha_val != 1.0))
+    optimized = false;
 
-  if( (a_dim == 0) || (b_dim == 0) )
-    fall_back = 1;
+  if((a_dim == 0) || (b_dim == 0))
+    optimized = false;
 
-  if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM))
-    fall_back = 1;
+  if((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     /*logic to find broadcast*/
     const int a_is_broadcasted = !out.sizes().equals(a.sizes());
@@ -129,20 +128,20 @@ Tensor& sub_out(
     float* const out_data = out.mutable_data_ptr<float>();
     if(broadcast == 1)
     {
-      int out_shape[NNLIB_MAX_DIM];
-      int inp1_shape[NNLIB_MAX_DIM];
-      int inp2_shape[NNLIB_MAX_DIM];
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
 
-      for(int i = 0; i < NNLIB_MAX_DIM; i++)
+      for(int i = 0; i < kNnlibMaxDim; i++)
       {
         out_shape[i] = 1;
         inp1_shape[i] = 1;
         inp2_shape[i] = 1;
       }
 
-      int off_o = NNLIB_MAX_DIM - out_dim;
-      int off_a = NNLIB_MAX_DIM - a_dim;
-      int off_b = NNLIB_MAX_DIM - b_dim;
+      int off_o = kNnlibMaxDim - out_dim;
+      int off_a = kNnlibMaxDim - a_dim;
+      int off_b = kNnlibMaxDim - b_dim;
       for(int i = 0; i < out_dim; i++)
         out_shape[i+off_o] = out.size(i);
       for(int i = 0; i < a_dim; i++)
@@ -156,29 +155,28 @@ Tensor& sub_out(
     {
       xa_nn_elm_sub_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
-
+
+    return out;
   }
-  else
-  {
 
-    ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-      ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        CTYPE_IN alpha_val;
-        utils::extract_scalar(alpha, &alpha_val);
-        ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
-          SubInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
-    });
-  }
+  ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      CTYPE_IN alpha_val;
+      utils::extract_scalar(alpha, &alpha_val);
+      ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        SubInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, alpha_val, out);
+      });
+    });
+  });
 
   return out;
 }
@@ -211,6 +209,8 @@ Tensor& sub_scalar_out(
   ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out);
   ET_KERNEL_CHECK(ctx, canCast(alpha_type, common_type), InvalidArgument, out);
 
+  /* When Half, first compute the result in float precision
+     and then downcast to half */
   if (common_type == ScalarType::Half) {
     common_type = ScalarType::Float;
   }
diff --git a/backends/cadence/hifi/operators/op_tanh.cpp b/backends/cadence/hifi/operators/op_tanh.cpp
index 47a057fe774..f06b57a3688 100644
--- a/backends/cadence/hifi/operators/op_tanh.cpp
+++ b/backends/cadence/hifi/operators/op_tanh.cpp
@@ -9,7 +9,7 @@
 #include <cmath>
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 
 namespace torch {
 namespace executor {
@@ -17,21 +17,19 @@ namespace native {
 
 Tensor& tanh_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 
-  int fall_back = 0;
+  bool optimized = true;
 
   if((in.scalar_type() != ScalarType::Float) || (out.scalar_type() != ScalarType::Float))
-    fall_back = 1;
+    optimized = false;
 
-  if(!fall_back)
+  if(optimized)
   {
     float* data_in = in.mutable_data_ptr<float>();
     float* data_out = out.mutable_data_ptr<float>();
     xa_nn_vec_tanh_f32_f32(data_out, data_in, (int)in.numel());
     return out;
   }
-  else
-  {
-    return internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out);
-  }
+
+  return internal::unary_ufunc_realhb_to_floath(std::tanh, ctx, in, out);
 }
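Editor's note: taken together, every operator in this patch now follows the same shape: compute an `optimized` gate from dtypes, rank, and broadcast depth, return early from the NNLIB fast path, and otherwise fall through to the portable ET_SWITCH implementation at function scope instead of inside an `else` block. A condensed, runnable skeleton of that control flow (the predicates and kernel calls below are stand-ins for the real checks and xa_nn_* entry points, not ExecuTorch APIs):

    #include <cstdio>

    // Stand-ins for the real dtype/rank/broadcast checks and kernels.
    static bool float_dtypes() { return true; }
    static bool nonzero_rank() { return true; }
    static bool broadcast_rank_ok() { return true; }
    static void nnlib_fast_path() { puts("fast path"); }
    static void portable_fallback() { puts("fallback"); }

    void op_out() {
      bool optimized = true;
      if (!float_dtypes())
        optimized = false;
      if (!nonzero_rank())
        optimized = false;
      if (!broadcast_rank_ok())
        optimized = false;

      if (optimized) {
        nnlib_fast_path(); // an xa_nn_* kernel in the real operators
        return;            // early return replaces the old if/else nesting
      }

      portable_fallback(); // the ET_SWITCH reference path
    }

    int main() {
      op_out();
      return 0;
    }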