From 829b05bddd154f9ea5d352aa51b15edcadad3051 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Sun, 11 Aug 2024 21:56:21 -0700 Subject: [PATCH 1/6] Adding operators NNLIB interated operators --- backends/cadence/aot/functions.yaml | 22 +- backends/cadence/hifi/kernels/CMakeLists.txt | 7 + backends/cadence/hifi/kernels/kernels.h | 81 + .../cadence/hifi/operators/CMakeLists.txt | 17 +- backends/cadence/hifi/operators/op_atan2.cpp | 212 ++ .../cadence/hifi/operators/op_bitwise_and.cpp | 310 +++ .../cadence/hifi/operators/op_bitwise_or.cpp | 309 +++ .../cadence/hifi/operators/op_bitwise_xor.cpp | 311 +++ backends/cadence/hifi/operators/op_cat.cpp | 180 ++ backends/cadence/hifi/operators/op_eq.cpp | 175 ++ .../hifi/operators/op_floor_divide.cpp | 130 ++ backends/cadence/hifi/operators/op_fmod.cpp | 262 +++ backends/cadence/hifi/operators/op_ge.cpp | 177 ++ backends/cadence/hifi/operators/op_gt.cpp | 177 ++ backends/cadence/hifi/operators/op_le.cpp | 177 ++ backends/cadence/hifi/operators/op_lt.cpp | 178 ++ backends/cadence/hifi/operators/op_ne.cpp | 176 ++ backends/cadence/hifi/operators/op_pow.cpp | 380 +++ .../cadence/hifi/operators/op_remainder.cpp | 233 ++ backends/cadence/hifi/operators/op_rsqrt.cpp | 7 +- .../hifi/operators/quantized_conv_out.cpp | 637 ++++++ .../hifi/operators/quantized_matmul_out.cpp | 186 ++ .../hifi/operators/quantized_relu_out.cpp | 51 + .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++ .../hifi/third-party/nnlib/xa_nn_concat_32.c | 172 ++ .../nnlib/xa_nn_elm_fmod_broadcast_f32.c | 525 +++++ .../nnlib/xa_nn_elm_logicalxor_bool_bool.c | 52 + .../nnlib/xa_nn_elm_remainder_broadcast_f32.c | 525 +++++ .../nnlib/xa_nn_greater_lesser_equal_f32.c | 2029 +++++++++++++++++ 29 files changed, 8004 insertions(+), 7 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_atan2.cpp create mode 100644 backends/cadence/hifi/operators/op_bitwise_and.cpp create mode 100644 backends/cadence/hifi/operators/op_bitwise_or.cpp create mode 100644 
backends/cadence/hifi/operators/op_bitwise_xor.cpp create mode 100644 backends/cadence/hifi/operators/op_cat.cpp create mode 100644 backends/cadence/hifi/operators/op_eq.cpp create mode 100644 backends/cadence/hifi/operators/op_floor_divide.cpp create mode 100644 backends/cadence/hifi/operators/op_fmod.cpp create mode 100644 backends/cadence/hifi/operators/op_ge.cpp create mode 100644 backends/cadence/hifi/operators/op_gt.cpp create mode 100644 backends/cadence/hifi/operators/op_le.cpp create mode 100644 backends/cadence/hifi/operators/op_lt.cpp create mode 100644 backends/cadence/hifi/operators/op_ne.cpp create mode 100644 backends/cadence/hifi/operators/op_pow.cpp create mode 100644 backends/cadence/hifi/operators/op_remainder.cpp create mode 100644 backends/cadence/hifi/operators/quantized_conv_out.cpp create mode 100644 backends/cadence/hifi/operators/quantized_matmul_out.cpp create mode 100644 backends/cadence/hifi/operators/quantized_relu_out.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index f79d5f870da..99937b4d495 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -20,7 +20,7 @@ - op: _softmax.out kernels: - arg_meta: null - kernel_name: torch::executor::softmax_out + kernel_name: torch::executor::softmax_out - op: add.out kernels: @@ -66,6 +66,16 @@ kernels: - arg_meta: null kernel_name: torch::executor::mul_out + +- op: mul.Scalar_out + kernels: + - 
arg_meta: null + kernel_name: torch::executor::mul_scalar_out + +- op: mean.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dim_out - op: permute_copy.out kernels: @@ -101,6 +111,11 @@ kernels: - arg_meta: null kernel_name: torch::executor::where_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rsqrt_out # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) @@ -134,3 +149,8 @@ kernels: - arg_meta: null kernel_name: impl::reference::quantized_relu_out + +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_matmul_out diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 7bb143f917e..df2f9db72fb 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -11,6 +11,13 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c + 
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c ) target_include_directories( diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 54d1914b89d..8119bcf8a67 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,87 @@ /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" +extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool( + WORD8 * __restrict__ p_out, + const WORD8 * __restrict__ p_inp1, + const WORD8 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_remainder_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_fmod_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_floor_div_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_floor_div_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 
*const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape); + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int *const out_shape, + WORD32* __restrict__ p_in, + const int * const in_shape, + int num_dims); + +extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32( + WORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + WORD32 kernel_type); + +extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( + WORD8 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type); + +extern "C" WORD32 xa_nn_concat_32_32( + WORD32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const WORD32 **pp_inps, + const WORD32 *const *pp_inps_shape, + WORD32 num_out_dims, + WORD32 num_inp, + WORD32 num_inp_dims, + WORD32 axis); + extern "C" WORD32 xa_nn_elm_floor_div_f32xf32_f32( FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index bf533bf8873..553c98c03ab 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -36,7 +36,22 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_floor_divide.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_fmod.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_lt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_le.cpp" + 
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_gt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ge.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_eq.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_remainder.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_or.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_xor.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" @@ -62,7 +77,7 @@ target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. add_library( custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" "quantized_relu_out.cpp" "quantized_conv_out.cpp") target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} ${_common_include_directories}) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp new file mode 100644 index 00000000000..5524b02510f --- /dev/null +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include"kernels.h" + +#define NNLIB_OPT 0 + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& +atan2_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + +#if NNLIB_OPT + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_is_broadcasted = (a_is_broadcasted && b_is_broadcasted); + + WORD32 num_elm = out.numel(); + + if(both_is_broadcasted) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + val = xa_nn_broadcast_32_32(ptr2, + p_out_shape, + pin2, + p_inp2_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * 
__restrict__)ptr2; + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + free(ptr2); + } + else if(a_is_broadcasted && (!b_is_broadcasted)) + { + FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__ )malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + WORD32 val = xa_nn_broadcast_32_32((WORD32 *)ptr1, + p_out_shape, + (WORD32 *)pin1, + p_inp1_shape, + 4); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm); + + free(ptr1); + } + else if(b_is_broadcasted && (!a_is_broadcasted)) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 
* __restrict__)ptr1; + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecatan2f (p_out, + p_inp1, + p_inp2, + num_elm ); + } +#else + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "atan2.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "atan2.out", CTYPE_B, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "atan2.out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_OUT casted_a = static_cast(val_a); + CTYPE_OUT casted_b = static_cast(val_b); + return static_cast(std::atan2(casted_a, casted_b)); + }, + a, + b, + out); + }); + }); + }); + +#endif + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_bitwise_and.cpp b/backends/cadence/hifi/operators/op_bitwise_and.cpp new file mode 100644 index 00000000000..bb71e8843b8 --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_and.cpp @@ -0,0 +1,310 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_and_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const 
WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); 
+ + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicaland_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_and.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_and.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_and.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_and, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_and_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + 
ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_and.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_and.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_and.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_and.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = std::bit_and()( + a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_bitwise_or.cpp b/backends/cadence/hifi/operators/op_bitwise_or.cpp new file mode 100644 index 00000000000..33e378a9b29 --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_or.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_or_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const 
WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); + 
+ const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicalor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_or.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_or.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_or.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_or, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_or_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + 
ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_or.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_or.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_or.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_or.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = + std::bit_or()(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_bitwise_xor.cpp b/backends/cadence/hifi/operators/op_bitwise_xor.cpp new file mode 100644 index 00000000000..e051f0d0fdc --- /dev/null +++ b/backends/cadence/hifi/operators/op_bitwise_xor.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// patternlint-disable-next-line executorch-cpp-nostdinc +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& bitwise_xor_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + + if(common_type == ScalarType::Bool) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_broadcasted = a_is_broadcasted && b_is_broadcasted; + + if(both_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + WORD8* __restrict__ ptr2 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pin2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + xa_nn_broadcast_8_8( + ptr2, + p_out_shape, + pin2, + p_inp2_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const 
WORD8 * __restrict__)ptr1; + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr2; + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(a_is_broadcasted && !b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ pin1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + xa_nn_broadcast_8_8(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + 4); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else if(!a_is_broadcasted && b_is_broadcasted) + { + WORD32 num_elm = out.numel(); + + WORD8* __restrict__ ptr1 = (WORD8* __restrict__ )malloc(num_elm); + + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ pinp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + xa_nn_broadcast_8_8( + ptr1, + p_out_shape, + pinp2, + p_inp2_shape, + 4); 
+ + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)ptr1; + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + else + { + const WORD8 * __restrict__ p_inp1 = (const WORD8 * __restrict__)a.const_data_ptr(); + const WORD8 * __restrict__ p_inp2 = (const WORD8 * __restrict__)b.const_data_ptr(); + + WORD8 * __restrict__ p_out = (WORD8 * __restrict__)out.mutable_data_ptr(); + + WORD32 num_elm = out.numel(); + + xa_nn_elm_logicalxor_boolxbool_bool( + p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_xor.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_INT_TYPES_AND( + Bool, b_type, ctx, "bitwise_xor.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_xor.Tensor_out", + CTYPE_OUT, + [&]() { + internal::BitwiseOpInner< + can_cast::value, + std::bit_xor, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& bitwise_xor_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + 
ET_SWITCH_INT_TYPES_AND( + Bool, a_type, ctx, "bitwise_xor.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_INTB_TYPES( + b_type, ctx, "bitwise_xor.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_INT_TYPES_AND( + Bool, + common_type, + ctx, + "bitwise_xor.Scalar_out", + CTYPE_IN, + [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, + out_type, + ctx, + "bitwise_xor.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast<CTYPE_IN>(val_a); + CTYPE_IN b_casted = + static_cast<CTYPE_IN>(val_b); + CTYPE_IN value = std::bit_xor<CTYPE_IN>()( + a_casted, b_casted); + + return static_cast<CTYPE_OUT>(value); + }, + a.const_data_ptr<CTYPE_A>(), + out.mutable_data_ptr<CTYPE_OUT>(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp new file mode 100644 index 00000000000..79e6129bf18 --- /dev/null +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include + +#include +#include + +#include "kernels.h" +#include "stdio.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& cat_out( + RuntimeContext& ctx, + exec_aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + + if(out.scalar_type()== ScalarType::Float){ + WORD32 num_inp = tensors.size(); + WORD32 num_inp_dims = tensors[0].dim(); + WORD32 num_out_dims = num_inp_dims; + WORD32 axis = dim; + + WORD32 inp_shape[16][16]; + WORD32 p_out_shape[16] = {0}; + + WORD32 *ptr_shape[16]; + const WORD32 *ptr[16]; + + for(int i = 0; i < num_inp; i++) + { + ptr[i] = (const WORD32 *)tensors[i].const_data_ptr(); + for(int j = 0; j < num_inp_dims; j++) + { + inp_shape[i][j] = tensors[i].size(j); + if(j == axis) + p_out_shape[j] += inp_shape[i][j]; + else + p_out_shape[j] = inp_shape[i][j]; + } + + ptr_shape[i] = inp_shape[i]; + } + + const WORD32 **pp_inps = &ptr[0]; + + WORD32 * p_out = (WORD32 *)out.mutable_data_ptr(); + + const WORD32 *const *pp_inps_shape = (const WORD32 *const *)&ptr_shape[0]; + + WORD32 val = xa_nn_concat_32_32(p_out + ,p_out_shape + ,pp_inps + ,pp_inps_shape + ,num_out_dims + ,num_inp + ,num_inp_dims + ,axis); + + return out; + } + else if(out.scalar_type() == ScalarType::Char){ + WORD32 num_inp = tensors.size(); + WORD32 num_inp_dims = tensors[0].dim(); + WORD32 num_out_dims = num_inp_dims; + WORD32 axis = dim; + + WORD32 inp_shape[16][16]; + WORD32 p_out_shape[16] = {0}; + + WORD32 *ptr_shape[16]; + const WORD8 *ptr[16]; + + for(int i = 0; i < num_inp; i++) + { + ptr[i] = (const WORD8 *)tensors[i].const_data_ptr(); + for(int j = 0; j < num_inp_dims; j++) + { + inp_shape[i][j] = tensors[i].size(j); + if(j == axis) + p_out_shape[j] += inp_shape[i][j]; + else + p_out_shape[j] = inp_shape[i][j]; + } + + ptr_shape[i] = inp_shape[i]; + } + + const WORD8 **pp_inps = &ptr[0]; + + WORD8 * p_out = (WORD8 *)out.mutable_data_ptr(); + + const WORD32 *const *pp_inps_shape = (const WORD32 
*const *)&ptr_shape[0]; + + WORD32 val = xa_nn_concat_8_8(p_out + ,p_out_shape + ,pp_inps + ,pp_inps_shape + ,num_out_dims + ,num_inp + ,num_inp_dims + ,axis); + + return out; + } + else { + + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + ET_CHECK( + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok); + + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } + + const size_t outer = getLeadingDims(out, dim); + const size_t dim_stride = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "cat", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REAL_TYPES_AND(Bool, in_type, ctx, "cat", CTYPE_IN, [&] { + if (tensors[j].numel() == 0) { + return; + } + size_t inner = tensors[j].size(dim) * dim_stride; + + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr<CTYPE_IN>() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + } + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_eq.cpp b/backends/cadence/hifi/operators/op_eq.cpp new file mode 100644 index
00000000000..8b66dc85a33 --- /dev/null +++ b/backends/cadence/hifi/operators/op_eq.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& eq_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 4); + } 
+ else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 4); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "eq.Scalar_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted == b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& eq_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "eq.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "eq.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, 
out_type, ctx, "eq.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted == b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_floor_divide.cpp b/backends/cadence/hifi/operators/op_floor_divide.cpp new file mode 100644 index 00000000000..0514df0ca25 --- /dev/null +++ b/backends/cadence/hifi/operators/op_floor_divide.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::floor_divide(a_casted, b_casted); + + return 
static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FloorDivideInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& floor_divide_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK(ctx, tensor_is_real_type(out), InvalidArgument, out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + auto div_by_zero_error = false; + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "floor_divide.out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "floor_divide.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "floor_divide.out", CTYPE_OUT, [&]() { + FloorDivideInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); + }); + }); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Floor divide operation encountered integer division by zero"); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_fmod.cpp b/backends/cadence/hifi/operators/op_fmod.cpp new file mode 100644 index 00000000000..a665cda0e0f --- /dev/null +++ b/backends/cadence/hifi/operators/op_fmod.cpp @@ -0,0 +1,262 @@ +/* + * Copyright 
(c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner { + static void + run(const Tensor& a, const Tensor& b, Tensor& out, bool& div_by_zero_error) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [&div_by_zero_error](const CTYPE_A val_a, const CTYPE_B val_b) { + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return static_cast(0); + } + } + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::fmod(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&, bool&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct FmodInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& fmod_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool 
b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_fmod_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_fmod_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + auto div_by_zero_error = false; + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "fmod.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "fmod.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "fmod.Tensor_out", CTYPE_OUT, [&]() { + FmodInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out, div_by_zero_error); + }); + }); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Fmod operation encountered integer division by zero"); + } + + return out; +} + +Tensor& fmod_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + // Check for integer division by zero + if (isIntegralType(common_type, /*includeBool=*/true)) { + auto is_zero = false; + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "fmod.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + is_zero = (val_b == 0); + }); + + ET_KERNEL_CHECK_MSG( + ctx, + !is_zero, + InvalidArgument, + out, + "Fmod operation encountered integer division by zero"); + } + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "fmod.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "fmod.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_REAL_TYPES( + common_type, ctx, "fmod.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES( + out_type, ctx, "fmod.Scalar_out", CTYPE_OUT, [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN 
value = std::fmod(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_ge.cpp b/backends/cadence/hifi/operators/op_ge.cpp new file mode 100644 index 00000000000..f81b981442f --- /dev/null +++ b/backends/cadence/hifi/operators/op_ge.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& ge_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); 
i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 0); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 0); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ge.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "ge.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ge.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted >= b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& ge_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = 
a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ge.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "ge.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "ge.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ge.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted >= b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_gt.cpp b/backends/cadence/hifi/operators/op_gt.cpp new file mode 100644 index 00000000000..4f0d6aec32d --- /dev/null +++ b/backends/cadence/hifi/operators/op_gt.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& gt_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 1); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 1); + } + } + else + { + // 
Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "gt.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "gt.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "gt.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted > b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& gt_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "gt.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "gt.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "gt.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "gt.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = 
static_cast(val_b); + bool value = a_casted > b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_le.cpp b/backends/cadence/hifi/operators/op_le.cpp new file mode 100644 index 00000000000..70673399bb6 --- /dev/null +++ b/backends/cadence/hifi/operators/op_le.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& le_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; 
i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 2); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 2); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "le.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "le.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "le.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted <= b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& le_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType 
a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "le.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "le.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "le.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "le.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted <= b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_lt.cpp b/backends/cadence/hifi/operators/op_lt.cpp new file mode 100644 index 00000000000..315e1457956 --- /dev/null +++ b/backends/cadence/hifi/operators/op_lt.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& lt_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 3); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 3); + } + } + else + { + + // 
Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "lt.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "lt.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "lt.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted < b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& lt_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "lt.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "lt.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, common_type, ctx, "lt.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "lt.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = 
static_cast(val_b); + bool value = a_casted < b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_ne.cpp b/backends/cadence/hifi/operators/op_ne.cpp new file mode 100644 index 00000000000..8a00211f6c4 --- /dev/null +++ b/backends/cadence/hifi/operators/op_ne.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor& ne_tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; 
i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape, + 5); + } + else + { + WORD8 * __restrict__ p_out = (WORD8 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm, + 5); + } + } + else + { + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ne.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "ne.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == + promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ne.Tensor_out", CTYPE_OUT, [&]() { + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted != b_casted; + return static_cast(value); + }, + a, + b, + out); + }); + }); + }); + } + + return out; +} + +Tensor& ne_scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = 
utils::get_scalar_dtype(b); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "ne.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "ne.Scalar_out", CTYPE_B, [&]() { + using CTYPE_IN = + typename torch::executor::promote_types::type; + ET_DCHECK( + CppTypeToScalarType::value == promoteTypes(a_type, b_type)); + ET_SWITCH_REAL_TYPES_AND( + Bool, out_type, ctx, "ne.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + bool value = a_casted != b_casted; + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 00000000000..fee28706f5c --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,380 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include"kernels.h" + +#define NNLIB_OPT 0 + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { +#if NNLIB_OPT + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool both_is_broadcasted = (a_is_broadcasted && b_is_broadcasted); + + WORD32 num_elm = out.numel(); + + if(both_is_broadcasted) + { + WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 
p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + val = xa_nn_broadcast_32_32(ptr2, + p_out_shape, + pin2, + p_inp2_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr2; + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + free(ptr2); + } + else if(a_is_broadcasted && (!b_is_broadcasted)) + { + FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__ )malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + + WORD32 val = xa_nn_broadcast_32_32((WORD32 *)ptr1, + p_out_shape, + (WORD32 *)pin1, + p_inp1_shape, + 4); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)ptr1; + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm); + + free(ptr1); + } + else if(b_is_broadcasted && (!a_is_broadcasted)) + { 
+ WORD32* __restrict__ ptr1 = (WORD32* __restrict__ )malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = b.size(i); + + WORD32 val = xa_nn_broadcast_32_32(ptr1, + p_out_shape, + pin1, + p_inp1_shape, + out.dim()); + + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)ptr1; + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + + free(ptr1); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + vecpowf (p_out, + p_inp1, + p_inp2, + num_elm ); + } +#else + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "pow.Tensor_Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES( + b_type, ctx, "pow.Tensor_Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = 
typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Tensor_Tensor_out", CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); +#endif + return out; +} + +Tensor& pow_Tensor_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = + utils::promote_type_with_scalar(a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, "pow.Tensor_Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "pow.Tensor_Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES( + common_type, ctx, "pow.Tensor_Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Tensor_Scalar_out", CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + RuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + 
InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + utils::promote_type_with_scalar(b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, "pow.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "pow.Scalar_out", CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, "pow.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES( + out_type, ctx, "pow.Scalar_out", CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + utils::extract_scalar(a, &val_a); + + apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp new file mode 100644 index 00000000000..c1fe48fc7db --- /dev/null +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = utils::remainder_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner + : public ReportCanCastBug {}; + +} // namespace +Tensor& remainder_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + if((a.scalar_type() == ScalarType::Float)||(b.scalar_type() == ScalarType::Float)) + { + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); + + if(any_is_broadcasted) + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[4]; + WORD32 p_inp1_shape[4]; + WORD32 
p_inp2_shape[4]; + + for(int i = 0; i < 4; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = 4 - out.dim(); + int off_a = 4 - a.dim(); + int off_b = 4 - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 val = xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape); + } + else + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + WORD32 num_elm = out.numel(); + + WORD32 val = xa_nn_elm_remainder_f32xf32_f32(p_out, + p_inp1, + p_inp2, + num_elm); + } + } + else + { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { + RemainderInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + } + + return out; +} + +Tensor& remainder_Scalar_out( + RuntimeContext& 
ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = utils::get_scalar_dtype(b); + ScalarType common_type = utils::promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "remainder.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + utils::extract_scalar(b, &val_b); + ET_SWITCH_REAL_TYPES( + common_type, ctx, "remainder.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES( + out_type, + ctx, + "remainder.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + static_cast(val_b); + CTYPE_IN value = utils::remainder_override( + a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp index e5e60bf783e..4cf295f98ae 100644 --- a/backends/cadence/hifi/operators/op_rsqrt.cpp +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -35,13 +35,10 @@ Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { p_out, p_inp, num_elm); - - return out; - } + return out; + } else - { return internal::unary_ufunc_realhb_to_floath(rsqrt, ctx, in, out); - } } diff --git a/backends/cadence/hifi/operators/quantized_conv_out.cpp b/backends/cadence/hifi/operators/quantized_conv_out.cpp new file mode 100644 
index 00000000000..60a78f0102d --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_conv_out.cpp @@ -0,0 +1,637 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "kernels.h" + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x))+(bytes-1))&(~(bytes-1))) + +#define NNLIB_OPT 0 + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; +using ScalarType = exec_aten::ScalarType; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. +// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + const int32_t* __restrict__ weight_zero_point = nullptr, + const float* __restrict__ bias_scale = nullptr, + float out_scale = 1, + OT out_zero_point = 0, + bool per_tensor_quantized = true) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. 
+ if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0);*/ + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + //((_w + d1 * _ww - p1 < w))) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0);*/ + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = + (per_tensor_quantized ? bias_scale[0] : bias_scale[_oc]) * + acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. 
+void quantized_conv_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + exec_aten::IntArrayRef stride, + exec_aten::IntArrayRef padding, + exec_aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + bool channel_last, + Tensor& out) { + bool conv1d = input.dim() == 3; + +#if NNLIB_OPT + + if(input.scalar_type() == ScalarType::Char) + { + WORD8* __restrict__ p_out = (WORD8* __restrict__ )out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = (WORD8* __restrict__ )input.const_data_ptr(); + WORD8* __restrict__ p_kernel = (WORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. 
/ output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD8 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + WORD32 inp_data_format = 0; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_height; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_inp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_kernel + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + 
kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:scratch_size; + + ptr_scratch = (WORD8 *)malloc(scratch_size + 16); + + p_scratch = (xa_codec_handle_t)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; ++_n) { + WORD8 *in_batch = pin + _n * input_channels * input_height * input_width; + WORD8 *out_batch = p_out + _n * out_channels * out_height * out_width; + + WORD32 val = xa_nn_conv2d_per_chan_sym8sxasym8s + (out_batch + ,in_batch + ,pkernel + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else if(input.scalar_type() == ScalarType::Byte) + { + printf("UINT8 CONV KERNEL"); + UWORD8* __restrict__ p_out = (UWORD8* __restrict__ )out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = (UWORD8* __restrict__ )input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = (UWORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? 
out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD8 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + WORD32 inp_data_format = 0; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_channels; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD8 * p_tmp = (WORD8 *)p_inp; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_tmp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + 
p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD8 * p_tmp1 = (WORD8 *)p_kernel; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_tmp1 + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:(scratch_size); + + ptr_scratch = (WORD8 *)malloc(scratch_size + 16); + + p_scratch = (xa_codec_handle_t)ALIGN_PTR(ptr_scratch, 8); + + const UWORD8* __restrict__ p_inp1 = (const UWORD8* __restrict__ )pin; + const UWORD8* __restrict__ p_kernel1 = (const UWORD8* __restrict__ )pkernel; + + for (int _n = 0; _n < batches; _n++) { + const UWORD8* __restrict__ in_batch = p_inp1 + _n * input_channels * input_height * input_width; + UWORD8* __restrict__ out_batch = p_out + _n * out_channels * out_height * out_width; + + WORD32 val = xa_nn_conv2d_per_chan_asym8xasym8 + (out_batch + ,in_batch + ,p_kernel1 + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else + { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type()); + } + +#else + + // input = [n, c, h, w] + const int n = 
input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? out.size(2) : out.size(3); + + // Bool flag to check if weight tensor is quantized per-tensor or + // per-channel + bool per_tensor_quantized = bias_scale.numel() == 1; + + if(input.scalar_type() == ScalarType::Char) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (int8_t)output_zero_point, + per_tensor_quantized); + + } + else if(input.scalar_type() == ScalarType::Byte) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (uint8_t)output_zero_point, + per_tensor_quantized); + } + else + { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type()); + } +#endif +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/quantized_matmul_out.cpp b/backends/cadence/hifi/operators/quantized_matmul_out.cpp new file mode 100644 index 00000000000..a22cf700d75 --- /dev/null +++ 
b/backends/cadence/hifi/operators/quantized_matmul_out.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +// The quantized matmul. The quantized matmul accumulates in a wider register, +// whose type is TA. +template < + typename TZ, + typename TA = float, + bool transposed = false, + typename TX = TZ, + typename TY = TZ> +__attribute__((noinline)) void qmatmul( + TZ* __restrict__ Z, + int32_t Z_multiplier, + int32_t Z_shift, + int32_t Z_zero_point, + const TX* __restrict__ X, + int32_t X_zero_point, + const TY* __restrict__ y, + int32_t Y_zero_point, + size_t m, + size_t n, + size_t p) { + // Compute the Z_scale from Z_multiplier and Z_shift + const float Z_scale = -Z_multiplier * 1.0 / (1 << 31) * pow(2, Z_shift); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < p; ++j) { + TA sum = 0; + for (size_t k = 0; k < n; ++k) { + if (transposed) { + sum += (X[i * n + k] - X_zero_point) * (y[j * n + k] - Y_zero_point); + } else { + sum += (X[i * n + k] - X_zero_point) * (y[k * p + j] - Y_zero_point); + } + } + Z[i * p + j] = kernels::quantize(sum, Z_scale, Z_zero_point); + } + } +} + +template +void inline _typed_quantized_matmul( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const exec_aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t 
leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + for (size_t i = 0; i < batch_size; ++i) { + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; + if (transposed) { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } else { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } + } +} + +void quantized_matmul_out( + RuntimeContext& ctx, + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const exec_aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + /*printf("transposed = %d\t", transposed); + printf("m = %d\t", leading_dim); + printf("n = %d\t", in_dim); + printf("p = %d\t", out_dim);*/ + + if (out.scalar_type() == exec_aten::ScalarType::Byte) { + //printf("Byte\n"); + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + + /*uint8_t* __restrict__ out_data = out.mutable_data_ptr(); + const uint8_t* __restrict__ X_data = X.const_data_ptr(); + const uint8_t* __restrict__ Y_data = Y.const_data_ptr(); + int bias_tmp[64] = {0}; + const int32_t* __restrict__ bias_data = bias_tmp;//bias.value().const_data_ptr(); + + xa_nn_matmul_asym8uxasym8u_asym8u( + out_data, // p_out + Y_data, // p_mat1, + 
X_data, // p_mat2, + bias_data, // p_bias + out_dim, // rows of p_mat1 + in_dim, // cols of p_mat1 + in_dim, // row_stride of p_mat1 + leading_dim, // vec_count, i.e., rows of p_mat2 + in_dim, // vec_offset of p_mat2. + out_dim, // out_offset, i.e., offset of next output element written + 1, // out_stride, i.e., stride to go to next output row + -(int32_t)Y_zero_point, // mat1_zero_bias + -(int32_t)X_zero_point, // mat2_zero_bias + (int32_t)out_multiplier, // out_multiplier + (int32_t)out_shift, // out_shift + (int32_t)out_zero_point); // out_zero_bias*/ + + } else if (out.scalar_type() == exec_aten::ScalarType::Char) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp new file mode 100644 index 00000000000..1643747baec --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "kernels.h" + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; + +// Note: this kernel assumes that the input and output share quantization +// parameters. If that is not the case, it will produce incorrect results. 
+template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + out[i] = in[i] > q_zero_point ? in[i] : q_zero_point; + } +} + +void quantized_relu_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + Tensor& output) { + if (input.scalar_type() == exec_aten::ScalarType::Byte) { + quantized_relu_(input, in_zero_point, output); + } else if (input.scalar_type() == exec_aten::ScalarType::Char) { + quantized_relu_(input, in_zero_point, output); + } else { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type()); + } +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 00000000000..cad3f1a25bb --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + 
while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 
* numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c new file mode 100644 index 00000000000..ca80d4ceee5 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -0,0 +1,172 @@ +#include "xa_type_def.h" +#include "xa_nn_common.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_common.h" + +WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 **pp_inps + ,const WORD32 *const *pp_inps_shape + ,WORD32 num_out_dims + ,WORD32 num_inp + ,WORD32 num_inp_dims + ,WORD32 axis) +{ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps, -1); + 
XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1); + //Validate Arguments + XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1); + XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1); + XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1); + XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1); + + int i = 0, j = 0; + for(i = 0; i < num_out_dims; i++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1); + } + + if(axis < 0) + axis = num_out_dims + axis; + + WORD32 concat_size = 0; + for (i = 0; i < num_inp; i++) + { + XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1); +#pragma loop_count min=1 + for(j = 0; j < num_out_dims; j++) + { + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); + } + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); + concat_size += pp_inps_shape[i][axis]; + } + + XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1); + + //Calculate outer and inner size for axis + WORD32 outer_size = 1; +#pragma no_simd + for(int i = 0; i < axis; i++) + { + outer_size *= p_out_shape[i]; + } + + WORD32 base_inner_size = 1; +#pragma no_simd + for(int i = axis + 1; i < num_out_dims; i++) + { + base_inner_size *= p_out_shape[i]; + } + + WORD32 *ptmp_out = p_out; + for(int i = 0; i < num_inp; i++) + { + const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size; + WORD32 *output_ptr = ptmp_out; + const WORD32* input_ptr = pp_inps[i]; + + if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0) + && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0)) + { + if(copy_size <= 8) + { + const ae_f32 *pae_inp = (const ae_f32 *)input_ptr; + 
for(int k = 0; k < outer_size; k++) + { + ae_f32 *pae_out = (ae_f32 *)output_ptr; +#pragma concurrent +#pragma no_simd + for(int ic = 0; ic < (copy_size >> 1); ic++) + { + *pae_out++ = *pae_inp++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + for(int ic = 0; ic < (copy_size >> 1); ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; + ae_f32 *puae_out = (ae_f32 *)pae_out; +#pragma concurrent + for(int ic = 0; ic < ((copy_size >> 1) & 3); ic++) + { + puae_out[ic] = puae_inp[ic]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + else + { + if(copy_size <= 6) + { + for(int k = 0; k < outer_size; k++) + { +#pragma concurrent +#pragma no_unroll + for(int ic = 0; ic < copy_size; ic++) + { + output_ptr[ic] = *input_ptr++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + + int copy_size_by6 = AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(copy_size, 0x2AAAAAAB))); + int copy_size_rem_start = 6*copy_size_by6; +#pragma concurrent + for(int ic = 0; ic < copy_size_by6; ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + for(int ic = copy_size_rem_start; ic < copy_size; ic++) + { + output_ptr[ic] = input_ptr[ic]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + 
ptmp_out += copy_size; + } + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c new file mode 100644 index 00000000000..139a97ec3f9 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c @@ -0,0 +1,525 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_fmod_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_fmod_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + 
if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + a = FITRUNC_S(a); + a = XT_MUL_S(a, a2); + a = XT_SUB_S(a1, a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_fmod_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, 
sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + c0 = FITRUNC_S(c0); + c0 = XT_MUL_S(c0, a0); + c0 = XT_SUB_S(b0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FITRUNC_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + c0 = FITRUNC_S(c0); + c0 = XT_MUL_S(c0, b0); + c0 = XT_SUB_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_fmod_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + 
if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_fmod_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 
*__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_fmod_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_fmod_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_fmod_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += 
inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c new file mode 100644 index 00000000000..752a25b6828 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c @@ -0,0 +1,52 @@ +#include "xa_nnlib_common.h" + +WORD32 xa_nn_elm_logicalxor_boolxbool_bool(WORD8 * __restrict__ p_out, + const WORD8 * __restrict__ p_inp1, + const WORD8 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD8), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + ae_int24x2 *pin1 = (ae_int24x2 *)p_inp1; + ae_int24x2 *pin2 = (ae_int24x2 *)p_inp2; + ae_int24x2 *pout = (ae_int24x2 *)p_out; + int i; + int N = num_elm; + /* Following line divides N by 6. Much faster than compiler implementation. Works for N<32768. 
*/ + /* unsigned int Nby6 = (N*10923)>>16;*/ + /* Following works for all int32 N */ + int Nby6 = AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(N, 0x2AAAAAAB))); + int remainder_start = 6*Nby6; + + ae_valign align_src_in1, align_src_in2, align_dst; + align_src_in1 = AE_LA64_PP(pin1); + align_src_in2 = AE_LA64_PP(pin2); + align_dst = AE_ZALIGN64(); + +/* Loop is unrolled by 6, to use LA24X2/SA24X2 */ + for(i=0; i < Nby6; i++){ + ae_int24x2 vi1, vi2, vo; + AE_LA24X2_IP(vi1, align_src_in1, pin1); + AE_LA24X2_IP(vi2, align_src_in2, pin2); + vo = AE_XOR24(vi1, vi2); + AE_SA24X2_IP(vo, align_dst, pout); + } + AE_SA64POS_FP(align_dst, pout); + + /* Remainder loop */ + #pragma no_unroll + for(i=remainder_start; i < N; i++){ + p_out[i] = p_inp1[i] & p_inp2[i]; + } + + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c new file mode 100644 index 00000000000..3b407522110 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c @@ -0,0 +1,525 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_remainder_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_remainder_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + 
XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + a = FIFLOOR_S(a); + a = XT_MUL_S(a, a2); + a = XT_SUB_S(a1, a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_remainder_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && 
((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, a0); + c0 = XT_SUB_S(b0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + 
XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, b0); + c0 = XT_SUB_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_remainder_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + 
p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + 
inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c new file mode 100644 index 00000000000..2372fcadcdd --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c @@ -0,0 +1,2029 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_greater_lesser_equal_f32xf32_f32, + ( + WORD8 *y, + const FLOAT32 *x1, + const FLOAT32 *x2, + WORD32 N, + WORD32 kernel_type + ) + ) +#else +WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + WORD32 kernel_type) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* 
Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + //xtfloatx2 *out = (xtfloatx2 *)p_out; + UWORD8 *out = p_out; + xtfloatx2 x1, x2, y; + xtbool check; + + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + if(kernel_type == 0) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 
2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = 
AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a1, a2); + + check = 0; + if(a <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a1, a2); + + check = 0; + if(a < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = 
xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *out++ = store1; + + uint8_t store0 = val & 0x1; + *out++ = store0; + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + //a = XT_SUB_S(a2, a1); + + check = 0; + if(a1 == a2) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *out++ = AE_MOVAD32_H(store); + *out++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *out++ = AE_MOVAD32_H(store); + 
*out++ = AE_MOVAD32_L(store); + } + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + + a = XT_SUB_S(a2, a1); + + check = 0; + if(a != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *out++ = store; + } + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag, + WORD32 kernel_type) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + + xtbool check; + + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; + + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, 
x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = 
store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && 
((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + //c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(a0 == b0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + 
*p_c++ = AE_MOVAD32_L(store); + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; + + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if (kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + 
uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + 
} + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x1, x2); + xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(a0, b0); + + check = 0; + + if(c0 < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + uint8_t val = AE_MOVAB2(check); + + uint8_t store1 = (val >> 1) & 
0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + //c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(a0 == b0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + else + { + ae_valign vinp1, vinp2; + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + + //y = XT_SUB_SX2(x2, x1); + xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + + ae_int32x2 store = AE_ZERO32(); + AE_MOVF32X2(store, ones, check); + + *p_c++ = AE_MOVAD32_H(store); + *p_c++ = AE_MOVAD32_L(store); + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_SUB_S(b0, a0); + + check = 0; + + if(c0 != 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + } + } +} + +static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag, + WORD32 kernel_type) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + + xtbool check; + + UWORD8 * p_c = p_out; + xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); + + 
const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + if(sign_flag){ + if(kernel_type == 0) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + 
if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out == 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 5) + { + ae_int32x2 ones = AE_MOVDA32(1); + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 1) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + 
inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 2) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out <= 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 3) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(a0_7, x2); + + check = 0; + + if(out < 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + else if(kernel_type == 4) + { + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + else + { + ae_valign inp1_a, out_a; + inp1_a = XT_LASX2PP(p_a); + + for(i=0; i> 1) & 0x1; + *p_c++ = store1; + + uint8_t store0 = val & 0x1; + *p_c++ = store0; + } + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); + out = XT_SUB_S(x2, a0_7); + + 
check = 0; + + if(out == 0) + check = 1; + + uint8_t store = AE_MOVAB(check); + *p_c++ = store; + } + } + } +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_greaterequal_broadcast_4D_f32xf32_f32, + ( + WORD8 * p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type + ) + ) +#else +WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(WORD8 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 kernel_type) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1); + + /* Check shapes */ + int i; + xtbool sign_flag; + for(i = 0; i < 4; i++) + { + if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) || + (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + UWORD8 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag, + kernel_type); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ 
p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag, + kernel_type); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag, + kernel_type); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag, + kernel_type); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += 
inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif From 16eea3fdf9d511cd68808663f625f7eb83c6e9ea Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Wed, 14 Aug 2024 02:57:15 -0700 Subject: [PATCH 2/6] Adding operators NNLIB interated operators --- backends/cadence/aot/functions_hifi.yaml | 172 ++++- backends/cadence/hifi/kernels/CMakeLists.txt | 1 + backends/cadence/hifi/kernels/kernels.h | 18 + .../cadence/hifi/operators/CMakeLists.txt | 8 +- backends/cadence/hifi/operators/op_where.cpp | 163 +++++ .../nnlib/xa_nn_elm_where_f32xf32_f32.c | 603 ++++++++++++++++++ 6 files changed, 962 insertions(+), 3 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_where.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 492456ce985..03a14580243 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -51,6 +51,61 @@ kernels: - arg_meta: null kernel_name: torch::executor::div_out_mode + +- op: floor_divide.out + kernels: + - arg_meta: null + kernel_name: torch::executor::floor_divide_out + +- op: remainder.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::remainder_Tensor_out + +- op: remainder.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::remainder_Scalar_out + +- op: fmod.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::fmod_Tensor_out + +- op: fmod.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::fmod_Scalar_out + +- op: bitwise_and.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_and_Scalar_out + +- op: bitwise_and.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_and_Tensor_out + +- op: bitwise_or.Scalar_out + 
kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_or_Scalar_out + +- op: bitwise_or.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_or_Tensor_out + +- op: bitwise_xor.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_xor_Scalar_out + +- op: bitwise_xor.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::bitwise_xor_Tensor_out - op: embedding.out kernels: @@ -67,6 +122,11 @@ - arg_meta: null kernel_name: torch::executor::mul_out +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + - op: permute_copy.out kernels: - arg_meta: null @@ -102,10 +162,105 @@ - arg_meta: null kernel_name: torch::executor::where_out +- op: scalar_tensor.out + kernels: + - arg_meta: null + kernel_name: torch::executor::scalar_tensor_out + - op: rsqrt.out kernels: - arg_meta: null - kernel_name: torch::executor::rsqrt_out + kernel_name: torch::executor::rsqrt_out + +- op: ge.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ge_scalar_out + +- op: ge.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ge_tensor_out + +- op: gt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_scalar_out + +- op: gt.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_tensor_out + +- op: le.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::le_scalar_out + +- op: le.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::le_tensor_out + +- op: lt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::lt_scalar_out + +- op: lt.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::lt_tensor_out + +- op: eq.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_scalar_out + +- op: eq.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_tensor_out + +- op: 
ne.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ne_scalar_out + +- op: ne.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::ne_tensor_out + +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::pow_Tensor_Tensor_out + +- op: atan2.out + kernels: + - arg_meta: null + kernel_name: torch::executor::atan2_out + +- op: empty.out + kernels: + - arg_meta: null + kernel_name: torch::executor::empty_out + +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gelu_out # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) @@ -130,4 +285,19 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_linear_out + +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_matmul_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_relu_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv_out diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index df2f9db72fb..08392cbe64d 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -18,6 +18,7 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_floor_div_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ) target_include_directories( diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 8119bcf8a67..e67fa60c3ec 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,24 @@ /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" +extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char *__restrict__ p_condition, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + const unsigned char *__restrict__ p_condition, + const WORD32 *const p_condition_shape + ); + extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool( WORD8 * __restrict__ p_out, const WORD8 * __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 553c98c03ab..4e8a44197b8 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ 
b/backends/cadence/hifi/operators/CMakeLists.txt @@ -33,7 +33,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_floor_divide.cpp" @@ -52,6 +52,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_or.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_xor.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_scalar_tensor.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" @@ -61,7 +63,9 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp") add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp new file mode 100644 index 00000000000..8ab534d22a2 --- /dev/null +++ 
b/backends/cadence/hifi/operators/op_where.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include "kernels.h" + +namespace torch { +namespace executor { +namespace native { + +#define NNLIB_MAX_DIM 4 + +Tensor& where_out( + RuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ScalarType cond_type = cond.scalar_type(); + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, cond, out) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "where.self_out"; + + ET_CHECK_MSG( + cond_type == ScalarType::Bool || cond_type == ScalarType::Byte, + "Unhandled dtype %s for where.self_out", + torch::executor::toString(cond_type)); + + /*logic to find broadcast*/ + const int a_is_broadcasted = !out.sizes().equals(a.sizes()); + const int b_is_broadcasted = !out.sizes().equals(b.sizes()); + const int cond_is_broadcasted = !out.sizes().equals(cond.sizes()); + const int broadcast = (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = cond.dim() > max_dim ? cond.dim() : max_dim; + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool check = 0; + + for(int i = 0; i < max_dim; i++) + { + if(cond.size(i) > b.size(i)) + check = 1; + + if(check == 1) + break; + } + + bool fall_back = 0; + if((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + fall_back = 1; + + if((broadcast == 1) && (max_dim > NNLIB_MAX_DIM)) + fall_back = 1; + + if((!fall_back) && (!check)) + { + const float* a_data = a.const_data_ptr(); + const float* b_data = b.const_data_ptr(); + float* out_data = out.mutable_data_ptr(); + const unsigned char* con = cond.const_data_ptr(); + + if(broadcast == 1) + { + int out_shape[NNLIB_MAX_DIM]; + int inp1_shape[NNLIB_MAX_DIM]; + int inp2_shape[NNLIB_MAX_DIM]; + int con_shape[NNLIB_MAX_DIM]; + + for(int i = 0; i < NNLIB_MAX_DIM; i++) + { + con_shape[i] = 1; + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = NNLIB_MAX_DIM - out.dim(); + int off_a = NNLIB_MAX_DIM - a.dim(); + int off_b = NNLIB_MAX_DIM - b.dim(); + int off_c = NNLIB_MAX_DIM - cond.dim(); + + for(int i = 0; i < out.dim(); i++) + out_shape[i+off_o] = out.size(i); + + for(int i = 0; i < a.dim(); i++) + inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + inp2_shape[i+off_b] = b.size(i); + for(int i = 0; i < cond.dim(); i++) + con_shape[i+off_c] = cond.size(i); + + /* Add fallback if broadcast and condition dimension are larger than inputs dimension, this code doesn't support that*/ + + if(con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) + { + void* p_scratch = malloc(out_shape[0]*out_shape[1]*out_shape[2]*out_shape[3]); + const unsigned char *p_brd_cond = (const unsigned char*)p_scratch; + xa_nn_broadcast_8_8((WORD8* __restrict__) p_brd_cond, out_shape, (const WORD8* __restrict__) con, con_shape, 4); + + for(int i = 0; i < 4; i++) + { + con_shape[i] = out_shape[i]; + } + xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, + b_data, 
inp2_shape, p_brd_cond, con_shape); + free(p_scratch); + } + else + { + xa_nn_elm_where_broadcast_4D_f32xf32_f32(out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape, con, con_shape); + } + } + else + { + xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); + } + } + else + { + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_OUT = + typename torch::executor::promote_types::type; + apply_ternary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b, const uint8_t val_c) { + CTYPE_OUT a_casted = static_cast(val_a); + CTYPE_OUT b_casted = static_cast(val_b); + return val_c ? a_casted : b_casted; + }, + a, + b, + cond, + out); + }); + }); + } + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c new file mode 100644 index 00000000000..3b20312cd8c --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c @@ -0,0 +1,603 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +#include "xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_where_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + const unsigned char *__restrict__ condition, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_where_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char *__restrict__ p_condition, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + 
XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + XT_MOVT_S(a, a1, s); + XT_MOVF_S(a, a2, s); + XT_SSI(a, (xtfloat *)out, 0); + } +} + +static void internal_elm_where_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char * __restrict__ p_condition, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + /* 
For out = condition ? inp2 :inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + condition = &p_condition[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x2, con); + XT_MOVF_SX2 (y, x1, con); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x2, con); + XT_MOVF_SX2 (y, x1, con); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_MOVT_S(c0, b0, s); + XT_MOVF_S(c0, a0, s); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For out = condition ? 
inp1 :inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + condition = &p_condition[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_MOVT_S(c0, a0, s); + XT_MOVF_S(c0, b0, s); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} +static void internal_elm_where_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char * __restrict__ p_condition, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 
= XT_LSI((xtfloat *)p_b, 0); + + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + /* For out = condition ? inp1 :inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + + int itr0, itr1, itr2; + FLOAT32 *p_out_tmp = p_out; + const unsigned char *__restrict p_condition_temp = p_condition; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_where_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + p_condition, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + 
inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_where_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + p_condition_temp, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + p_condition_temp += in_lc * out_lc; + } + + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_where_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_condition_temp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = 
p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_where_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_condition_temp, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + p_condition_temp += p_out_shape[3]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file From 6cf1a219229166c94c792ba6f62cfeaf1da196b4 Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Sat, 17 Aug 2024 22:18:00 -0700 Subject: [PATCH 3/6] Adding concat32_32 --- backends/cadence/hifi/operators/op_cat.cpp | 73 +++++----------------- 1 file changed, 17 insertions(+), 56 deletions(-) diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp index 79e6129bf18..52c3d5a1a78 100644 --- a/backends/cadence/hifi/operators/op_cat.cpp +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -9,10 +9,9 @@ #include #include -#include +#include #include "kernels.h" -#include "stdio.h" namespace torch { namespace executor { @@ -26,9 +25,9 @@ Tensor& cat_out( int64_t dim, Tensor& out) { - if(out.scalar_type()== ScalarType::Float){ + if(out.scalar_type() == ScalarType::Float){ WORD32 num_inp = tensors.size(); - WORD32 num_inp_dims = tensors[0].dim(); + WORD32 num_inp_dims = out.dim(); WORD32 num_out_dims = num_inp_dims; WORD32 axis = dim; @@ -38,72 +37,35 @@ Tensor& cat_out( WORD32 *ptr_shape[16]; const WORD32 *ptr[16]; + int k = 0; + int count = 0; for(int i = 0; i < num_inp; i++) { - ptr[i] = (const WORD32 *)tensors[i].const_data_ptr(); + if(tensors[i].numel() == 0) + continue; + ptr[k] = (const WORD32 *)tensors[i].const_data_ptr(); for(int j = 0; j < num_inp_dims; j++) { - inp_shape[i][j] = tensors[i].size(j); - if(j == axis) - p_out_shape[j] += inp_shape[i][j]; - else - p_out_shape[j] = 
inp_shape[i][j]; + inp_shape[k][j] = tensors[i].size(j); } - - ptr_shape[i] = inp_shape[i]; + ptr_shape[k] = inp_shape[k]; + k++; } - const WORD32 **pp_inps = &ptr[0]; - - WORD32 * p_out = (WORD32 *)out.mutable_data_ptr(); - - const WORD32 *const *pp_inps_shape = (const WORD32 *const *)&ptr_shape[0]; - - WORD32 val = xa_nn_concat_32_32(p_out - ,p_out_shape - ,pp_inps - ,pp_inps_shape - ,num_out_dims - ,num_inp - ,num_inp_dims - ,axis); + num_inp = k; - return out; - } - else if(out.scalar_type() == ScalarType::Char){ - WORD32 num_inp = tensors.size(); - WORD32 num_inp_dims = tensors[0].dim(); - WORD32 num_out_dims = num_inp_dims; - WORD32 axis = dim; - - WORD32 inp_shape[16][16]; - WORD32 p_out_shape[16] = {0}; - - WORD32 *ptr_shape[16]; - const WORD8 *ptr[16]; - - for(int i = 0; i < num_inp; i++) + for(int i = 0; i < num_out_dims; i++) { - ptr[i] = (const WORD8 *)tensors[i].const_data_ptr(); - for(int j = 0; j < num_inp_dims; j++) - { - inp_shape[i][j] = tensors[i].size(j); - if(j == axis) - p_out_shape[j] += inp_shape[i][j]; - else - p_out_shape[j] = inp_shape[i][j]; - } - - ptr_shape[i] = inp_shape[i]; + p_out_shape[i] = out.size(i); } - const WORD8 **pp_inps = &ptr[0]; + const WORD32 **pp_inps = &ptr[0]; - WORD8 * p_out = (WORD8 *)out.mutable_data_ptr(); + WORD32 * p_out = (WORD32 *)out.mutable_data_ptr(); const WORD32 *const *pp_inps_shape = (const WORD32 *const *)&ptr_shape[0]; - WORD32 val = xa_nn_concat_8_8(p_out + WORD32 val = xa_nn_concat_32_32(p_out ,p_out_shape ,pp_inps ,pp_inps_shape @@ -172,7 +134,6 @@ Tensor& cat_out( } return out; -#endif } } // namespace native From 50d47e5b701d076b7334f541d18a62de54f72f9e Mon Sep 17 00:00:00 2001 From: Rushi-cad Date: Sat, 17 Aug 2024 23:11:49 -0700 Subject: [PATCH 4/6] Adding concat32_32 --- .../hifi/third-party/nnlib/xa_nn_concat_32.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c 
b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c index ca80d4ceee5..244f404d2ea 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -48,6 +48,7 @@ WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out { XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); } + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); concat_size += pp_inps_shape[i][axis]; } @@ -87,7 +88,7 @@ WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out ae_f32 *pae_out = (ae_f32 *)output_ptr; #pragma concurrent #pragma no_simd - for(int ic = 0; ic < (copy_size >> 1); ic++) + for(int ic = 0; ic < copy_size; ic++) { *pae_out++ = *pae_inp++; } @@ -113,9 +114,9 @@ WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; ae_f32 *puae_out = (ae_f32 *)pae_out; #pragma concurrent - for(int ic = 0; ic < ((copy_size >> 1) & 3); ic++) + for(int ic = 0; ic < (copy_size & 1); ic++) { - puae_out[ic] = puae_inp[ic]; + puae_out[copy_size - 1] = puae_inp[copy_size - 1]; } input_ptr += copy_size; output_ptr += concat_size * base_inner_size; @@ -147,19 +148,18 @@ WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out inp_a = AE_LA64_PP(pae_inp); out_a = AE_ZALIGN64(); - int copy_size_by6 = AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(copy_size, 0x2AAAAAAB))); - int copy_size_rem_start = 6*copy_size_by6; #pragma concurrent - for(int ic = 0; ic < copy_size_by6; ic++) + for(int ic = 0; ic < copy_size >> 1; ic++) { ae_int32x2 d0; AE_LA32X2_IP(d0, inp_a, pae_inp); AE_SA32X2_IP(d0, out_a, pae_out); } AE_SA64POS_FP(out_a, pae_out); - for(int ic = copy_size_rem_start; ic < copy_size; ic++) + + for(int ic = 0; ic < (copy_size & 1); ic++) { - output_ptr[ic] = input_ptr[ic]; + output_ptr[copy_size - 1] = input_ptr[copy_size - 1]; } input_ptr += copy_size; output_ptr += concat_size * base_inner_size; From abde562bd62040df2e000bbcfca3799d2e50be59 Mon 
Sep 17 00:00:00 2001 From: Rushi-cad Date: Sun, 18 Aug 2024 04:19:36 -0700 Subject: [PATCH 5/6] Code cleanup --- backends/cadence/hifi/operators/op_cat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp index 52c3d5a1a78..39304ce46bd 100644 --- a/backends/cadence/hifi/operators/op_cat.cpp +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -38,7 +38,6 @@ Tensor& cat_out( const WORD32 *ptr[16]; int k = 0; - int count = 0; for(int i = 0; i < num_inp; i++) { if(tensors[i].numel() == 0) From d9d58f6fa15ab29d227f8c01c01a41df80ae8fa2 Mon Sep 17 00:00:00 2001 From: dijopaul <87994875+dijopaul@users.noreply.github.com> Date: Mon, 19 Aug 2024 18:02:36 +0530 Subject: [PATCH 6/6] Delete backends/cadence/aot/functions.yaml --- backends/cadence/aot/functions.yaml | 156 ---------------------------- 1 file changed, 156 deletions(-) delete mode 100644 backends/cadence/aot/functions.yaml diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml deleted file mode 100644 index 99937b4d495..00000000000 --- a/backends/cadence/aot/functions.yaml +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that are also defined by the ATen library. -# For lean mode: -# - Codegen'd target `executorch_generated_lib` will be reading all the information -# from this file, including operator schema and kernel metadata. -# - Selective build target `codegen:executorch_defined_ops` now is selecting all the -# operators in this file, by dumping all the op names into `selected_operators.yaml`. -# -# See the README.md file in executorch/kernels/portable for a description of the syntax used -# by this file. 
- - -# aten ops -- op: _to_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::to_copy_out - -- op: _softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::softmax_out - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bmm_out - -- op: cat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cat_out - -- op: clone.out - kernels: - - arg_meta: null - kernel_name: torch::executor::clone_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out - -- op: div.out_mode - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out_mode - -- op: embedding.out - kernels: - - arg_meta: null - kernel_name: torch::executor::embedding_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_scalar_out - -- op: mean.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mean_dim_out - -- op: permute_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::permute_copy_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sigmoid_out - -- op: slice_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out - -- op: split_with_sizes_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::split_with_sizes_copy_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_out - -- op: view_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::view_copy_out - -- op: where.self_out - kernels: - - arg_meta: null - kernel_name: torch::executor::where_out - -- op: rsqrt.out - kernels: - - arg_meta: null - kernel_name: 
torch::executor::rsqrt_out - -# custom ops -- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: impl::reference::quantize_per_tensor_out - -- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: impl::reference::dequantize_per_tensor_out - -- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_conv_out - -- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_layer_norm_out - -- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_linear_out - -- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_relu_out - -- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? 
bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: impl::reference::quantized_matmul_out