From 3b9dce3917b240c94ad7e7ec7156adcca8349e9c Mon Sep 17 00:00:00 2001 From: Nishak Date: Wed, 13 Nov 2024 22:17:17 -0800 Subject: [PATCH 1/3] Adding cat, full, permute_copy and relu ops --- backends/cadence/aot/functions_hifi.yaml | 12 +- backends/cadence/hifi/kernels/CMakeLists.txt | 2 + backends/cadence/hifi/kernels/kernels.h | 19 ++ .../cadence/hifi/operators/CMakeLists.txt | 8 +- backends/cadence/hifi/operators/op_cat.cpp | 156 +++++++++++ backends/cadence/hifi/operators/op_full.cpp | 101 +++++++ .../hifi/operators/op_permute_copy.cpp | 197 +++++++++++++ .../hifi/operators/quantized_relu_out.cpp | 96 +++++++ .../hifi/third-party/nnlib/xa_nn_concat_32.c | 172 ++++++++++++ .../third-party/nnlib/xa_nn_transpose_32.c | 260 ++++++++++++++++++ 10 files changed, 1015 insertions(+), 8 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_cat.cpp create mode 100644 backends/cadence/hifi/operators/op_full.cpp create mode 100644 backends/cadence/hifi/operators/op_permute_copy.cpp create mode 100644 backends/cadence/hifi/operators/quantized_relu_out.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index bd1102ab0b8..ed4f6a2c1e9 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -35,7 +35,7 @@ - op: cat.out kernels: - arg_meta: null - kernel_name: torch::executor::cat_out + kernel_name: impl::HiFi::cat_out - op: clone.out kernels: @@ -60,7 +60,7 @@ - op: full.out kernels: - arg_meta: null - kernel_name: torch::executor::full_out + kernel_name: impl::HiFi::full_out - op: maximum.out kernels: @@ -85,7 +85,7 @@ - op: permute_copy.out kernels: - arg_meta: null - kernel_name: torch::executor::permute_copy_out + kernel_name: impl::HiFi::permute_copy_out - op: pow.Scalar_out kernels: @@ -155,7 +155,6 @@ - 
arg_meta: null kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out - - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -165,3 +164,8 @@ kernels: - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_linear_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_relu_out \ No newline at end of file diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 3d321443f8b..549371255d9 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -10,6 +10,7 @@ add_library( kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -18,6 +19,7 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c ) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 7233fe6c29f..9a4689c17c2 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -23,6 +23,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32( const int* const in_shape, int num_dims); +extern "C" WORD32 xa_nn_concat_32_32( + WORD32* __restrict__ p_out, + const WORD32* const p_out_shape, + const WORD32** pp_inps, + const WORD32* const* pp_inps_shape, + WORD32 num_out_dims, + WORD32 num_inp, + WORD32 num_inp_dims, + WORD32 axis); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -125,6 +135,15 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( WORD32 num_axis_dims, void* __restrict__ p_scratch_in); +extern "C" WORD32 xa_nn_transpose_32_32( + WORD32* __restrict__ p_out, + const WORD32* const p_out_shape, + const WORD32* __restrict__ p_inp, + const WORD32* const p_inp_shape, + const WORD32* __restrict__ p_permute_vec, + WORD32 num_out_dims, + WORD32 num_inp_dims); + namespace cadence { namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 6d21c4b49a7..c01dad5ce80 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -21,11 +21,14 @@ endif() # ATen compliant ops that are needed to run this model. 
set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" @@ -33,11 +36,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" @@ -71,7 +71,7 @@ target_include_directories( # Custom ops that are needed to run the test model. add_library( custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" + "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp new file mode 100644 index 00000000000..7e0031efd5e --- /dev/null +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::getLeadingDims; +using executorch::runtime::getTrailingDims; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::check_cat_args; +using torch::executor::Error; +using torch::executor::get_cat_out_target_size; + +namespace impl { +namespace HiFi { +namespace native { + +Tensor& cat_out( + RuntimeContext& ctx, + exec_aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + constexpr auto name = "cat.out"; + constexpr int kNnlibMaxDim = 16; + + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_inp = tensors.size(); + WORD32 num_inp_dims = out.dim(); + WORD32 num_out_dims = num_inp_dims; + WORD32 axis = dim; + + WORD32 inp_shape[kNnlibMaxDim][kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + + WORD32* ptr_shape[kNnlibMaxDim]; + const WORD32* ptr[kNnlibMaxDim]; + + int k = 0; + for (int i = 0; i < num_inp; i++) { + if (tensors[i].numel() == 0) + continue; + ptr[k] = (const WORD32*)tensors[i].const_data_ptr(); + for (int j = 0; j < num_inp_dims; j++) { + inp_shape[k][j] = tensors[i].size(j); + } + ptr_shape[k] = inp_shape[k]; + k++; + } + + num_inp = k; + + for (int i = 0; i < num_out_dims; i++) { + p_out_shape[i] = out.size(i); + } + + const WORD32** pp_inps = &ptr[0]; + + WORD32* 
p_out = (WORD32*)out.mutable_data_ptr(); + + const WORD32* const* pp_inps_shape = (const WORD32* const*)&ptr_shape[0]; + + WORD32 ret_val = xa_nn_concat_32_32( + p_out, + p_out_shape, + pp_inps, + pp_inps_shape, + num_out_dims, + num_inp, + num_inp_dims, + axis); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + return out; + } + + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out); + + Tensor::SizesType + expected_out_size[executorch::runtime::kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } + + const size_t outer = getLeadingDims(out, dim); + const size_t dim_stride = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] { + if (tensors[j].numel() == 0) { + return; + } + size_t inner = tensors[j].size(dim) * dim_stride; + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + + return out; +} + +} // namespace native +} // 
namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp new file mode 100644 index 00000000000..0afb2152380 --- /dev/null +++ b/backends/cadence/hifi/operators/op_full.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using exec_aten::IntArrayRef; +using exec_aten::RuntimeContext; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; + +Tensor& full_out( + RuntimeContext& ctx, + const IntArrayRef sizes, + const Scalar& fill_value, + Tensor& out) { + (void)ctx; + + ScalarType val_type = get_scalar_dtype(fill_value); + ScalarType out_type = out.scalar_type(); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, sizes) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + constexpr auto name = "full.out"; + + bool optimized = 0; + if (out_type == ScalarType::Long || out_type == ScalarType::Float || + out_type == ScalarType::Byte || out_type == ScalarType::Char) + optimized = 1; + + if(out_type != val_type) + optimized = 0; + + if (optimized) { + if (out_type == ScalarType::Long) { + int* data_out = out.mutable_data_ptr(); + int val; + extract_scalar(fill_value, &val); + for (size_t i = 0; i < out.numel(); ++i) { + data_out[i] = val; + } + } else if (out_type == ScalarType::Float) { + float* data_out = out.mutable_data_ptr(); + float val; + extract_scalar(fill_value, &val); + + WORD32 ret_val = xa_nn_memset_f32_f32( + data_out, + val, + out.numel()); + + 
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else if (out_type == ScalarType::Byte || out_type == ScalarType::Char) { + char* data_out = out.mutable_data_ptr(); + int val; + extract_scalar(fill_value, &val); + memset((void*)data_out, val, out.numel()); + } + return out; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, name, CTYPE_VAL, [&] { + CTYPE_VAL val; + extract_scalar(fill_value, &val); + + ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT val_casted = static_cast(val); + auto data_out = out.mutable_data_ptr(); + for (size_t i = 0; i < out.numel(); ++i) { + data_out[i] = val_casted; + } + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp new file mode 100644 index 00000000000..37b0aa8b058 --- /dev/null +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::SizesType; +using exec_aten::Tensor; +using executorch::runtime::IntArrayRef; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::kTensorDimensionLimit; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::check_permute_copy_args; +using torch::executor::Error; +using torch::executor::get_permute_copy_out_target_size; + +namespace impl { +namespace HiFi { +namespace native { + +namespace { + +void increment_coordinate_permuted( + const Tensor& tensor, + size_t* const coordinate, + IntArrayRef dims) { + for (int i = dims.size() - 1; i >= 0; i--) { + size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim(); + coordinate[d]++; + if (coordinate[d] == tensor.size(d)) { + coordinate[d] = 0; + } else { + return; + } + } +} + +} // namespace + +Tensor& permute_copy_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dims, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_permute_copy_out_target_size( + in, dims, expected_out_size, &expected_out_dim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + const auto in_type = out.scalar_type(); + + constexpr auto name = "permute_copy.out"; + constexpr int kNnlibMaxDim = 16; + + bool optimized = false; + + if (out.scalar_type() == ScalarType::Float) + optimized = true; + else if (out.scalar_type() == ScalarType::Char) + optimized = true; + else if (out.scalar_type() == ScalarType::Byte) + optimized = true; + + if (in.dim() > kNnlibMaxDim) + optimized = false; + + if (optimized) { + if (in_type == 
ScalarType::Float) { + WORD32* p_inp = (WORD32*)in.const_data_ptr(); + WORD32* p_out = (WORD32*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 val = xa_nn_transpose_32_32( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + return out; + } else if (in_type == ScalarType::Char) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + + } else if (in_type == ScalarType::Byte) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + } + return out; + } + + size_t in_coord[kTensorDimensionLimit] = {0}; 
+ size_t trailing_dims_memo[kTensorDimensionLimit]; + executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo); + + // in and out must be the same dtype + ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + for (size_t i = 0; i < out.numel(); ++i) { + out_data[i] = + in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo( + in, in_coord, trailing_dims_memo)]; + increment_coordinate_permuted(in, in_coord, dims); + } + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp new file mode 100644 index 00000000000..3ddf18c7411 --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +using Tensor = exec_aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = exec_aten::ScalarType; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const T temp = in[i] > q_zero_point ? 
(in[i] - q_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + const uint8_t *p_in = input.const_data_ptr(); + uint8_t *p_out =output.mutable_data_ptr(); + uint8_t q_zero_point = in_zero_point.const_data_ptr()[0]; + + WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u( p_out, + p_in, + (int)q_zero_point, + (int)255, + input.numel()); + ret_val = 5; + + ET_CHECK_MSG( + ret_val == 0, + "An internal error occured"); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + const int8_t *p_in = input.const_data_ptr(); + int8_t *p_out = output.mutable_data_ptr(); + int8_t q_zero_point = in_zero_point.const_data_ptr()[0]; + + WORD32 ret_val = xa_nn_vec_activation_min_max_8_8( p_out, + p_in, + (int)q_zero_point, + (int)128, + input.numel()); + + ET_CHECK_MSG( + ret_val == 0, + "An internal error occured"); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c new file mode 100644 index 00000000000..244f404d2ea --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -0,0 +1,172 @@ +#include "xa_type_def.h" +#include "xa_nn_common.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_common.h" + +WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 **pp_inps + ,const WORD32 
#include "xa_type_def.h"
#include "xa_nn_common.h"
#include "xa_nnlib_kernels_api.h"
#include "xa_nnlib_common_macros.h"
#include "xa_nnlib_err_chk.h"
#include "xa_nnlib_common.h"

/*
 * Concatenates num_inp tensors of 32-bit words along `axis` into p_out.
 *
 * p_out          destination buffer
 * p_out_shape    output shape, num_out_dims entries, all > 0
 * pp_inps        num_inp input data pointers
 * pp_inps_shape  num_inp shape arrays, num_inp_dims entries each
 * num_out_dims   output rank, 1..6
 * num_inp        number of inputs, 1..10
 * num_inp_dims   input rank, must equal num_out_dims
 * axis           concatenation axis in [-num_out_dims, num_out_dims)
 *
 * Returns 0 on success. The argument-check macros are called with -1 as
 * their failure value.
 */
WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out
                    ,const WORD32 *const p_out_shape
                    ,const WORD32 **pp_inps
                    ,const WORD32 *const *pp_inps_shape
                    ,WORD32 num_out_dims
                    ,WORD32 num_inp
                    ,WORD32 num_inp_dims
                    ,WORD32 axis)
{
  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
  XA_NNLIB_ARG_CHK_PTR(pp_inps, -1);
  XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1);
  /* Pointer alignment checks */
  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
  XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1);
  XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1);
  //Validate Arguments
  XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1);
  XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1);
  XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1);
  XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1);

  int i = 0, j = 0;
  /* Every output extent must be strictly positive (no empty outputs). */
  for(i = 0; i < num_out_dims; i++)
  {
    XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1);
  }

  /* Map a negative axis onto its non-negative equivalent. */
  if(axis < 0)
    axis = num_out_dims + axis;

  /* Each input must match the output on every axis except `axis`, and have a
   * positive extent on `axis`; concat_size accumulates those extents. */
  WORD32 concat_size = 0;
  for (i = 0; i < num_inp; i++)
  {
    XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1);
    XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1);
    XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1);
#pragma loop_count min=1
    for(j = 0; j < num_out_dims; j++)
    {
      XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1);
    }

    XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1);
    concat_size += pp_inps_shape[i][axis];
  }

  /* The output extent on `axis` must equal the sum of the input extents. */
  XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1);

  //Calculate outer and inner size for axis
  /* outer_size: product of the output extents before `axis`. */
  WORD32 outer_size = 1;
#pragma no_simd
  for(int i = 0; i < axis; i++)
  {
    outer_size *= p_out_shape[i];
  }

  /* base_inner_size: product of the output extents after `axis`. */
  WORD32 base_inner_size = 1;
#pragma no_simd
  for(int i = axis + 1; i < num_out_dims; i++)
  {
    base_inner_size *= p_out_shape[i];
  }

  /* For each input: copy outer_size runs of copy_size words. Consecutive runs
   * of one input land concat_size * base_inner_size words apart in the
   * output; ptmp_out is where this input's first run starts. */
  WORD32 *ptmp_out = p_out;
  for(int i = 0; i < num_inp; i++)
  {
    const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size;
    WORD32 *output_ptr = ptmp_out;
    const WORD32* input_ptr = pp_inps[i];

    /* First variant: run length and output row length are both even and the
     * low address bit of both pointers is clear. */
    if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0)
      && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0))
    {
      if(copy_size <= 8)
      {
        /* Short runs: word-by-word copy. pae_inp is declared outside the k
         * loop, so it keeps advancing through the input across runs. */
        const ae_f32 *pae_inp = (const ae_f32 *)input_ptr;
        for(int k = 0; k < outer_size; k++)
        {
          ae_f32 *pae_out = (ae_f32 *)output_ptr;
#pragma concurrent
#pragma no_simd
          for(int ic = 0; ic < copy_size; ic++)
          {
            *pae_out++ = *pae_inp++;
          }
          output_ptr += concat_size * base_inner_size;
        }
      }
      else
      {
        /* Longer runs: copy two words per iteration through the ae_valign
         * load/store pair (presumably the unaligned-access intrinsics -
         * confirm in the HiFi ISA reference). */
        for(int k = 0; k < outer_size; k++)
        {
          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
          ae_valign inp_a, out_a;
          inp_a = AE_LA64_PP(pae_inp);
          out_a = AE_ZALIGN64();
          for(int ic = 0; ic < (copy_size >> 1); ic++)
          {
            ae_int32x2 d0;
            AE_LA32X2_IP(d0, inp_a, pae_inp);
            AE_SA32X2_IP(d0, out_a, pae_out);
          }
          AE_SA64POS_FP(out_a, pae_out);
          const ae_f32 *puae_inp = (const ae_f32 *)pae_inp;
          ae_f32 *puae_out = (ae_f32 *)pae_out;
          /* copy_size is even in this variant, so (copy_size & 1) is 0 and
           * this tail loop executes zero times. */
#pragma concurrent
          for(int ic = 0; ic < (copy_size & 1); ic++)
          {
            puae_out[copy_size - 1] = puae_inp[copy_size - 1];
          }
          input_ptr += copy_size;
          output_ptr += concat_size * base_inner_size;
        }
      }
    }
    else
    {
      /* Second variant: odd lengths or a set low address bit. */
      if(copy_size <= 6)
      {
        /* Short runs: plain word copy; input_ptr advances across runs. */
        for(int k = 0; k < outer_size; k++)
        {
#pragma concurrent
#pragma no_unroll
          for(int ic = 0; ic < copy_size; ic++)
          {
            output_ptr[ic] = *input_ptr++;
          }
          output_ptr += concat_size * base_inner_size;
        }
      }
      else
      {
        for(int k = 0; k < outer_size; k++)
        {
          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
          ae_valign inp_a, out_a;
          inp_a = AE_LA64_PP(pae_inp);
          out_a = AE_ZALIGN64();

          /* Two words per iteration... */
#pragma concurrent
          for(int ic = 0; ic < copy_size >> 1; ic++)
          {
            ae_int32x2 d0;
            AE_LA32X2_IP(d0, inp_a, pae_inp);
            AE_SA32X2_IP(d0, out_a, pae_out);
          }
          AE_SA64POS_FP(out_a, pae_out);

          /* ...then the last word of the run when copy_size is odd. */
          for(int ic = 0; ic < (copy_size & 1); ic++)
          {
            output_ptr[copy_size - 1] = input_ptr[copy_size - 1];
          }
          input_ptr += copy_size;
          output_ptr += concat_size * base_inner_size;
        }
      }
    }
    /* The next input starts right after this input's first run. */
    ptmp_out += copy_size;
  }
  return 0;
}
input_ptr[copy_size - 1]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + ptmp_out += copy_size; + } + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c new file mode 100644 index 00000000000..e7b80e3a1d9 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c @@ -0,0 +1,260 @@ +#include "xa_nnlib_common.h" +#include "stdio.h" +/* + * Currently only supports upto 5D input tensors. + * 1/2/3/4 D input tensors will be scaled up to 5D. + * For example, 2x3 -> 1x1x1x2x3. + */ + +WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 * __restrict__ p_inp + ,const WORD32 *const p_inp_shape + ,const WORD32 * __restrict__ p_permute_vec + ,WORD32 num_out_dims + ,WORD32 num_inp_dims) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp, -1); + XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); + + /* Invalid input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1); + XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1); + + int itr = 0; + for(itr=0; itr < num_inp_dims; itr++) + { + XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1); + } + for(itr=0; itr < num_out_dims; itr++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1); + } + + + /* Output shape provided must be correct based on input + * shape and permute values */ + for(itr=0; itr < num_out_dims; itr++) + { + int output_dim = p_out_shape[itr]; + int expected_dim = p_inp_shape[p_permute_vec[itr]]; + XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1); + } + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1); + 
XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); + + /* Shift all dim with 1 in the outer part */ + int eff_output_shape[5]; + int eff_permute_vec[5]; + + for(int i = 0; i < num_out_dims; i++) + { + eff_output_shape[i] = p_out_shape[i]; + eff_permute_vec[i] = p_permute_vec[i]; + } + + int one_i=num_out_dims-1, non_one_i=num_out_dims-1; + while(one_i > 0 && non_one_i >=0){ + while(one_i > 0 && eff_output_shape[one_i]!=1){ + one_i--; + } + non_one_i = one_i; + while(non_one_i >= 0 && eff_output_shape[non_one_i]==1) + { + non_one_i--; + } + if(one_i > 0 && non_one_i >=0){ + int temp; + /*swap output_shape*/ + { + temp = eff_output_shape[one_i]; + eff_output_shape[one_i] = eff_output_shape[non_one_i]; + eff_output_shape[non_one_i] = temp; + } + /*swap permute_vec*/ + { + temp = eff_permute_vec[one_i]; + eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; + eff_permute_vec[non_one_i] = temp; + } + + } + } + + /* Promoting lesser dim tensors to 5D tensors. + * Also updating the permute_vec and shapes as needed for optimization */ + int p_5D_inp_shape[5] = {1, 1, 1, 1, 1}; + int p_5D_out_shape[5] = {1, 1, 1, 1, 1}; + int p_5D_permute_vec[5] = {0, 1, 2, 3, 4}; + + /* Check if any inner inp dimension is same in the output */ + int last_dim_same = 1, last_n_same_dim = 0; + itr = num_inp_dims - 1; + while(itr >= 0) + { + last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; + last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0; + itr--; + } + + int dims_added = 5 - num_inp_dims; + itr = num_inp_dims - 1; + int same_count = last_n_same_dim; + int count = 4; + while(itr >= 0) + { + p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr]; + p_5D_out_shape[count] = (same_count > 0) ? 
p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr]; + same_count--; + itr--; + count = (same_count > 0) ? count : count - 1; + } + + itr = num_inp_dims - 1; + same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; + count = 4; + while(itr >= 0) + { + p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; + same_count--; + itr--; + count--; + } + + int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; + int inp_dim1, inp_dim2, inp_dim3, inp_dim4; + int inp_stride[5]; + + out_dim0 = p_5D_out_shape[0]; + out_dim1 = p_5D_out_shape[1]; + out_dim2 = p_5D_out_shape[2]; + out_dim3 = p_5D_out_shape[3]; + out_dim4 = p_5D_out_shape[4]; + + inp_dim1 = p_5D_inp_shape[1]; + inp_dim2 = p_5D_inp_shape[2]; + inp_dim3 = p_5D_inp_shape[3]; + inp_dim4 = p_5D_inp_shape[4]; + + inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4; + inp_stride[1] = inp_dim2*inp_dim3*inp_dim4; + inp_stride[2] = inp_dim3*inp_dim4; + inp_stride[3] = inp_dim4; + inp_stride[4] = 1; + + if(last_n_same_dim) + { + int itr0, itr1, itr2, itr3, itr4; + WORD32 *p_inp0 = (WORD32 *)p_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++) + { + WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); +#pragma loop_count min=1 + for(itr1 = 0; itr1 < out_dim1; itr1++) + { + WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); +#pragma loop_count min=1 + for(itr2 = 0; itr2 < out_dim2; itr2++) + { + WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); +#pragma loop_count min=1 + for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) + { + WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); + if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0)) + { + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + 
AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32)); + AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32)); + } + ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; + } + } + else + { + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_valign a_inp = AE_LA64_PP(pae_i); + ae_valign a_out = AE_ZALIGN64(); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + AE_LA32X2_IP(d0, a_inp, pae_i); + AE_SA32X2_IP(d0, a_out, pae_o); + } + AE_SA64POS_FP(a_out, pae_o); + ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; + } + } + } + } + } + } + } + else + { + int itr0, itr1, itr2, itr3, itr4; + WORD32 *p_inp0 = (WORD32 *)p_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++) + { + WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); + for(itr1 = 0; itr1 < out_dim1; itr1++) + { + WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); + for(itr2 = 0; itr2 < out_dim2; itr2++) + { + WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); + for(itr3 = 0; itr3 < out_dim3; itr3++) + { + WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); + + ae_valign a_out = AE_ZALIGN64(); + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + ae_int32x2 d0, d1; + ae_int32x2 tmp0; + + AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + + tmp0 = AE_SEL32_HH(d0, d1); + + AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out); + } + AE_SA64POS_FP(a_out, p_out); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + *p_out++ = *p_inp4; + } + } + } + } + } + } + + 
return 0; +} \ No newline at end of file From 495c1525293bed746be05b523c3fe2e46cc2bc3f Mon Sep 17 00:00:00 2001 From: Nishak Date: Thu, 14 Nov 2024 06:37:41 -0800 Subject: [PATCH 2/3] clean up --- backends/cadence/aot/functions_hifi.yaml | 8 ++-- backends/cadence/hifi/operators/op_cat.cpp | 2 + backends/cadence/hifi/operators/op_full.cpp | 19 +++++----- .../hifi/operators/op_permute_copy.cpp | 15 ++++---- .../hifi/operators/quantized_relu_out.cpp | 38 +++++++------------ 5 files changed, 37 insertions(+), 45 deletions(-) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index ed4f6a2c1e9..0f3e582884c 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -35,7 +35,7 @@ - op: cat.out kernels: - arg_meta: null - kernel_name: impl::HiFi::cat_out + kernel_name: cadence::impl::HiFi::cat_out - op: clone.out kernels: @@ -60,7 +60,7 @@ - op: full.out kernels: - arg_meta: null - kernel_name: impl::HiFi::full_out + kernel_name: cadence::impl::HiFi::full_out - op: maximum.out kernels: @@ -70,7 +70,7 @@ - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out - op: minimum.out kernels: @@ -85,7 +85,7 @@ - op: permute_copy.out kernels: - arg_meta: null - kernel_name: impl::HiFi::permute_copy_out + kernel_name: cadence::impl::HiFi::permute_copy_out - op: pow.Scalar_out kernels: diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp index 7e0031efd5e..1a628924457 100644 --- a/backends/cadence/hifi/operators/op_cat.cpp +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -23,6 +23,7 @@ using torch::executor::check_cat_args; using torch::executor::Error; using torch::executor::get_cat_out_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -154,3 +155,4 @@ Tensor& cat_out( } // namespace native } // namespace HiFi } // 
namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp index 0afb2152380..47804a64f45 100644 --- a/backends/cadence/hifi/operators/op_full.cpp +++ b/backends/cadence/hifi/operators/op_full.cpp @@ -11,6 +11,7 @@ #include #include +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -44,13 +45,13 @@ Tensor& full_out( constexpr auto name = "full.out"; - bool optimized = 0; + bool optimized = false; if (out_type == ScalarType::Long || out_type == ScalarType::Float || out_type == ScalarType::Byte || out_type == ScalarType::Char) - optimized = 1; - - if(out_type != val_type) - optimized = 0; + optimized = true; + + if (out_type != val_type) + optimized = false; if (optimized) { if (out_type == ScalarType::Long) { @@ -65,10 +66,7 @@ Tensor& full_out( float val; extract_scalar(fill_value, &val); - WORD32 ret_val = xa_nn_memset_f32_f32( - data_out, - val, - out.numel()); + WORD32 ret_val = xa_nn_memset_f32_f32(data_out, val, out.numel()); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); @@ -98,4 +96,5 @@ Tensor& full_out( } // namespace native } // namespace HiFi -} // namespace impl \ No newline at end of file +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index 37b0aa8b058..bb72eaf521a 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -22,6 +22,7 @@ using torch::executor::check_permute_copy_args; using torch::executor::Error; using torch::executor::get_permute_copy_out_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -75,11 +76,9 @@ Tensor& permute_copy_out( bool optimized = false; - if (out.scalar_type() == ScalarType::Float) - optimized = true; - else if (out.scalar_type() 
== ScalarType::Char) - optimized = true; - else if (out.scalar_type() == ScalarType::Byte) + if (out.scalar_type() == ScalarType::Float || + out.scalar_type() == ScalarType::Char || + out.scalar_type() == ScalarType::Byte) optimized = true; if (in.dim() > kNnlibMaxDim) @@ -103,7 +102,7 @@ Tensor& permute_copy_out( p_permute_vec[i] = dims[i]; } - WORD32 val = xa_nn_transpose_32_32( + WORD32 ret_val = xa_nn_transpose_32_32( p_out, p_out_shape, p_inp, @@ -112,7 +111,8 @@ Tensor& permute_copy_out( num_out_dims, num_inp_dims); - return out; + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } else if (in_type == ScalarType::Char) { WORD8* p_inp = (WORD8*)in.const_data_ptr(); WORD8* p_out = (WORD8*)out.mutable_data_ptr(); @@ -195,3 +195,4 @@ Tensor& permute_copy_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp index 3ddf18c7411..031a472cced 100644 --- a/backends/cadence/hifi/operators/quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -53,35 +53,25 @@ void quantized_relu_out( const Tensor& out_multiplier, const Tensor& out_shift, Tensor& output) { - if (input.scalar_type() == executorch::aten::ScalarType::Byte) { - const uint8_t *p_in = input.const_data_ptr(); - uint8_t *p_out =output.mutable_data_ptr(); + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + const uint8_t* p_in = input.const_data_ptr(); + uint8_t* p_out = output.mutable_data_ptr(); uint8_t q_zero_point = in_zero_point.const_data_ptr()[0]; - - WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u( p_out, - p_in, - (int)q_zero_point, - (int)255, - input.numel()); - ret_val = 5; - ET_CHECK_MSG( - ret_val == 0, - "An internal error occured"); + WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u( + p_out, p_in, (int)q_zero_point, (int)255, 
input.numel()); + + ET_CHECK_MSG(ret_val == 0, "An internal error occured"); } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { - const int8_t *p_in = input.const_data_ptr(); - int8_t *p_out = output.mutable_data_ptr(); + const int8_t* p_in = input.const_data_ptr(); + int8_t* p_out = output.mutable_data_ptr(); int8_t q_zero_point = in_zero_point.const_data_ptr()[0]; - - WORD32 ret_val = xa_nn_vec_activation_min_max_8_8( p_out, - p_in, - (int)q_zero_point, - (int)128, - input.numel()); - ET_CHECK_MSG( - ret_val == 0, - "An internal error occured"); + WORD32 ret_val = xa_nn_vec_activation_min_max_8_8( + p_out, p_in, (int)q_zero_point, (int)128, input.numel()); + + ET_CHECK_MSG(ret_val == 0, "An internal error occured"); + } else { ET_CHECK_MSG( false, From 0c6112aa57d25d884022aa9aa6983ba4aadb9aab Mon Sep 17 00:00:00 2001 From: Nishak Date: Thu, 14 Nov 2024 09:38:44 -0800 Subject: [PATCH 3/3] changing relu op --- .../hifi/operators/quantized_relu_out.cpp | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp index 031a472cced..98f130df8c7 100644 --- a/backends/cadence/hifi/operators/quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -58,17 +58,34 @@ void quantized_relu_out( uint8_t* p_out = output.mutable_data_ptr(); uint8_t q_zero_point = in_zero_point.const_data_ptr()[0]; - WORD32 ret_val = xa_nn_vec_activation_min_max_asym8u_asym8u( - p_out, p_in, (int)q_zero_point, (int)255, input.numel()); + WORD32 ret_val = xa_nn_vec_relu_asym8u_asym8u( + p_out, + p_in, + (int)q_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + (int)out_zero_point, + 0, + 255, + input.numel()); ET_CHECK_MSG(ret_val == 0, "An internal error occured"); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { const int8_t* p_in = 
input.const_data_ptr(); int8_t* p_out = output.mutable_data_ptr(); int8_t q_zero_point = in_zero_point.const_data_ptr()[0]; - WORD32 ret_val = xa_nn_vec_activation_min_max_8_8( - p_out, p_in, (int)q_zero_point, (int)128, input.numel()); + WORD32 ret_val = xa_nn_vec_relu_asym8s_asym8s( + p_out, + p_in, + (int)q_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + (int)out_zero_point, + -128, + 127, + input.numel()); ET_CHECK_MSG(ret_val == 0, "An internal error occured");