14 changes: 9 additions & 5 deletions backends/cadence/aot/functions_hifi.yaml
@@ -35,7 +35,7 @@
 - op: cat.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::cat_out
+      kernel_name: cadence::impl::HiFi::cat_out
 
 - op: clone.out
   kernels:
@@ -60,7 +60,7 @@
 - op: full.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::full_out
+      kernel_name: cadence::impl::HiFi::full_out
 
 - op: maximum.out
   kernels:
@@ -70,7 +70,7 @@
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mean_dim_out
+      kernel_name: cadence::impl::HiFi::mean_dim_out
 
 - op: minimum.out
   kernels:
@@ -85,7 +85,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: cadence::impl::HiFi::permute_copy_out
 
 - op: pow.Scalar_out
   kernels:
@@ -155,7 +154,6 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out
 
-
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
@@ -165,3 +164,8 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out
+
+- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_out
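Editor's note: the C++ entry point that this new schema binds to is not shown in this diff. Under ExecuTorch's usual codegen conventions (context first, `out` last), its signature would look roughly like the sketch below; this is inferred from the schema, not taken from the PR.

```cpp
// Sketch of the signature implied by the quantized_relu.out schema above.
// The real declaration lives in quantized_relu_out.cpp, which this diff
// section does not show.
Tensor& quantized_relu_out(
    RuntimeContext& ctx,
    const Tensor& X,
    const Tensor& X_zero_point,
    int64_t out_zero_point,
    const Tensor& out_multiplier,
    const Tensor& out_shift,
    Tensor& out);
```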
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -10,6 +10,7 @@ add_library(
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
@@ -18,6 +19,7 @@
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
19 changes: 19 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -23,6 +23,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32(
     const int* const in_shape,
     int num_dims);
 
+extern "C" WORD32 xa_nn_concat_32_32(
+    WORD32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const WORD32** pp_inps,
+    const WORD32* const* pp_inps_shape,
+    WORD32 num_out_dims,
+    WORD32 num_inp,
+    WORD32 num_inp_dims,
+    WORD32 axis);
+
 extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const WORD32* const p_out_shape,
@@ -125,6 +135,15 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
     WORD32 num_axis_dims,
     void* __restrict__ p_scratch_in);
 
+extern "C" WORD32 xa_nn_transpose_32_32(
+    WORD32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const WORD32* __restrict__ p_inp,
+    const WORD32* const p_inp_shape,
+    const WORD32* __restrict__ p_permute_vec,
+    WORD32 num_out_dims,
+    WORD32 num_inp_dims);
+
 namespace cadence {
 namespace impl {
 namespace HiFi {
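Editor's note: op_permute_copy.cpp is not part of this section, but a permute kernel would drive the new transpose primitive roughly as below. This is a sketch, not code from the PR; it assumes the same 32-bit-word convention op_cat.cpp uses for float data, and the `permute_vec` semantics (output dim i comes from input dim `permute_vec[i]`) are an assumption about the nnlib kernel.

```cpp
// Hypothetical sketch: permute a 2x3x4 buffer of 32-bit words (e.g. float
// data) into a 4x2x3 layout with the new kernel. Not code from this PR.
WORD32 permute_3d_sketch(const WORD32* p_inp, WORD32* p_out) {
  WORD32 inp_shape[3] = {2, 3, 4}; // original shape
  WORD32 out_shape[3] = {4, 2, 3}; // shape after the permutation
  WORD32 permute_vec[3] = {2, 0, 1}; // assumed: out dim i <- in dim permute_vec[i]

  // Like the other nnlib entry points used in this PR, 0 signals success.
  return xa_nn_transpose_32_32(
      p_out,
      out_shape,
      p_inp,
      inp_shape,
      permute_vec,
      /*num_out_dims=*/3,
      /*num_inp_dims=*/3);
}
```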
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -21,23 +21,23 @@ endif()
 # ATen compliant ops that are needed to run this model.
 set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
@@ -71,7 +71,7 @@ target_include_directories(
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
-             "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp"
+             "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
158 changes: 158 additions & 0 deletions backends/cadence/hifi/operators/op_cat.cpp
@@ -0,0 +1,158 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <cstring>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::getLeadingDims;
using executorch::runtime::getTrailingDims;
using executorch::runtime::resize_tensor;
using executorch::runtime::tensors_have_same_dim_order;
using torch::executor::check_cat_args;
using torch::executor::Error;
using torch::executor::get_cat_out_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

Tensor& cat_out(
    RuntimeContext& ctx,
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor& out) {
  constexpr auto name = "cat.out";
  constexpr int kNnlibMaxDim = 16;

  // cat's dim may be negative; normalize it before either path indexes with
  // it.
  if (dim < 0) {
    dim += out.dim();
  }

  bool optimized = true;

  // The NNLib fast path handles float data only, and only within the kernel's
  // dimension and input-count limits; otherwise use the portable path below.
  if (out.scalar_type() != ScalarType::Float || out.dim() > kNnlibMaxDim ||
      tensors.size() > static_cast<size_t>(kNnlibMaxDim))
    optimized = false;

  if (optimized) {
    WORD32 num_inp = tensors.size();
    WORD32 num_inp_dims = out.dim();
    WORD32 num_out_dims = num_inp_dims;
    WORD32 axis = dim;

    WORD32 inp_shape[kNnlibMaxDim][kNnlibMaxDim];
    WORD32 p_out_shape[kNnlibMaxDim];

    WORD32* ptr_shape[kNnlibMaxDim];
    const WORD32* ptr[kNnlibMaxDim];

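    // Collect data pointers and shapes for the non-empty inputs only; the
    // NNLib kernel is not given zero-sized tensors.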
    int k = 0;
    for (int i = 0; i < num_inp; i++) {
      if (tensors[i].numel() == 0)
        continue;
      ptr[k] = (const WORD32*)tensors[i].const_data_ptr<float>();
      for (int j = 0; j < num_inp_dims; j++) {
        inp_shape[k][j] = tensors[i].size(j);
      }
      ptr_shape[k] = inp_shape[k];
      k++;
    }

    num_inp = k;

    for (int i = 0; i < num_out_dims; i++) {
      p_out_shape[i] = out.size(i);
    }

    const WORD32** pp_inps = &ptr[0];

    WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

    const WORD32* const* pp_inps_shape = (const WORD32* const*)&ptr_shape[0];

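    // float and WORD32 are both 32 bits wide, so the kernel can concatenate
    // float data as opaque 32-bit words.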
    WORD32 ret_val = xa_nn_concat_32_32(
        p_out,
        p_out_shape,
        pp_inps,
        pp_inps_shape,
        num_out_dims,
        num_inp,
        num_inp_dims,
        axis);

    ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

    return out;
  }

  ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), InvalidArgument, out);

  Tensor::SizesType
      expected_out_size[executorch::runtime::kTensorDimensionLimit];
  size_t expected_out_dim = 0;
  get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);

  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
      InvalidArgument,
      out);

  // Special handling when all inputs are 1D-empty tensors for aten
  // consistency: in that case, just return a 1D-empty tensor without checking
  // dim.
  bool all_1d_empty = true;
  for (size_t i = 0; i < tensors.size(); ++i) {
    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
      all_1d_empty = false;
      break;
    }
  }
  if (all_1d_empty) {
    return out;
  }

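  // Portable fallback: copy one input slice at a time, iterating over the
  // leading dims before `dim` and the contiguous span after it.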
  const size_t outer = getLeadingDims(out, dim);
  const size_t dim_stride = getTrailingDims(out, dim);
  const size_t ninputs = tensors.size();

  const auto out_type = out.scalar_type();
  ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] {
    CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
    for (size_t i = 0; i < outer; ++i) {
      for (size_t j = 0; j < ninputs; ++j) {
        const auto in_type = tensors[j].scalar_type();
        ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] {
          if (tensors[j].numel() == 0) {
            return;
          }
          size_t inner = tensors[j].size(dim) * dim_stride;
          const CTYPE_IN* const in_ptr =
              tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;

          for (size_t k = 0; k < inner; ++k) {
            out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
          }
          out_ptr += inner;
        });
      }
    }
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
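Editor's note: a minimal smoke-test sketch of the new cat_out, assuming ExecuTorch's TensorFactory test utility and a default-constructible RuntimeContext; both assumptions are drawn from the portable-op unit tests, not from this PR.

```cpp
// Hypothetical test sketch; the TensorFactory include path mirrors
// ExecuTorch's portable op tests and is an assumption, not part of this PR.
// Also assumes a declaration of cadence::impl::HiFi::native::cat_out is in
// scope.
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using torch::executor::testing::TensorFactory;

void cat_out_smoke_test() {
  TensorFactory<ScalarType::Float> tf;

  // Two 2x2 inputs concatenated along dim 0 give a 4x2 output.
  Tensor a = tf.make({2, 2}, {1.f, 2.f, 3.f, 4.f});
  Tensor b = tf.make({2, 2}, {5.f, 6.f, 7.f, 8.f});
  Tensor out = tf.zeros({4, 2});

  Tensor inputs[] = {a, b};
  executorch::aten::RuntimeContext ctx{};
  cadence::impl::HiFi::native::cat_out(
      ctx, exec_aten::ArrayRef<Tensor>(inputs, 2), /*dim=*/0, out);
}
```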