diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 582bf178bff..c04dd1fafd3 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -125,6 +125,11 @@ - arg_meta: null kernel_name: impl::HiFi::dequantize_per_tensor_out +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 90cd814e1e5..db2143d0c93 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,6 +9,9 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_matXvec_asym8xasym8_asym8_circ.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -28,6 +31,7 @@ target_include_directories( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/kernels/cnn/hifi4/ ${_common_include_directories} ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 7601d969447..2396bc4b414 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -16,6 +16,32 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ +extern "C" WORD32 xa_nn_conv2d_per_chan_asym8xasym8(UWORD8* __restrict__ p_out, + const UWORD8* __restrict__ p_inp, + const UWORD8* __restrict__ p_kernel, + const WORD32* __restrict__ p_bias, + WORD32 input_height, + WORD32 input_width, + WORD32 input_channels, + WORD32 kernel_height, + WORD32 kernel_width, + WORD32 kernel_channels, + WORD32 dilation_height, + WORD32 dilation_width, + WORD32 out_channels, + WORD32 x_stride, + WORD32 y_stride, + WORD32 x_padding, + WORD32 y_padding, + WORD32 out_height, + WORD32 out_width, + WORD32 input_zero_bias, + WORD32 * p_out_multiplier, + WORD32 * p_out_shift, + WORD32 out_zero_bias, + WORD32 out_data_format, + VOID *p_scratch); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const WORD32 *const p_out_shape, const FLOAT32 * __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 0bd117771f9..2bcf4321d48 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -66,6 +66,7 @@ target_include_directories( add_library( custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" + "quantized_conv_out.cpp" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/operators/quantized_conv_out.cpp b/backends/cadence/hifi/operators/quantized_conv_out.cpp new file mode 100644 index 00000000000..b40becd1e40 --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_conv_out.cpp @@ -0,0 +1,632 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "kernels.h" + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x))+(bytes-1))&(~(bytes-1))) + +namespace impl { +namespace HiFi { +namespace native { + +using Tensor = exec_aten::Tensor; +using RuntimeContext = torch::executor::RuntimeContext; +using ScalarType = exec_aten::ScalarType; + + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. +// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + const int32_t* __restrict__ weight_zero_point = nullptr, + const float* __restrict__ bias_scale = nullptr, + float out_scale = 1, + OT out_zero_point = 0, + bool per_tensor_quantized = true) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0);*/ + + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + //((_w + d1 * _ww - p1 < w))) { + + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + + (quantized ? 0 : 0); + /*float rhs = weight_plane[woff] - + (quantized ? weight_zero_point[0] : 0);*/ + + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = + (per_tensor_quantized ? bias_scale[0] : bias_scale[_oc]) * + acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. +void quantized_conv_out( + RuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + exec_aten::IntArrayRef stride, + exec_aten::IntArrayRef padding, + exec_aten::IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + bool channel_last, + Tensor& out) { + bool conv1d = input.dim() == 3; + + if(input.scalar_type() == ScalarType::Char) + { + WORD8* __restrict__ p_out = (WORD8* __restrict__ )out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = (WORD8* __restrict__ )input.const_data_ptr(); + WORD8* __restrict__ p_kernel = (WORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD32 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_height; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_inp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_kernel + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:scratch_size; + + ptr_scratch = (WORD32 *)malloc(scratch_size + 16); + + p_scratch = (xa_codec_handle_t)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; ++_n) { + WORD8 *in_batch = pin + _n * input_channels * input_height * input_width; + WORD8 *out_batch = p_out + _n * out_channels * out_height * out_width; + + WORD32 val = xa_nn_conv2d_per_chan_sym8sxasym8s + (out_batch + ,in_batch + ,pkernel + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else if(input.scalar_type() == ScalarType::Byte) + { + UWORD8* __restrict__ p_out = (UWORD8* __restrict__ )out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = (UWORD8* __restrict__ )input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = (UWORD8* __restrict__ )weight.const_data_ptr(); + WORD32* __restrict__ p_bias = (WORD32* __restrict__ )bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = 1; + WORD32 dilation_height = 1; + + WORD32 * kernel_bias_ptr = (WORD32 *)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -kernel_bias_ptr[0]; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for(int i = 0; i < out_channels; i++) + { + out_multiplier32[i] = bias_scale.const_data_ptr()[0] * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = -3; + WORD32 kernel_precision = -3; + pVOID p_scratch = nullptr; + WORD32 *ptr_scratch; + + WORD32 scratch_size = 0; + + WORD32 out_data_format = 1; + + WORD8 *ptr1 = (WORD8 *)malloc(((input.size(0) * input_channels * input_height * input_width) + 8) * sizeof(WORD8)); + WORD8 *ptr2 = (WORD8 *)malloc(((out_channels * kernel_channels * kernel_height * kernel_width) + 8) * sizeof(WORD8)); + + WORD8 *pin = (WORD8 *)ALIGN_PTR(ptr1, 8); + WORD8 *pkernel = (WORD8 *)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_height; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims = 4; + WORD32 num_inp_dims = 4; + + WORD8 * p_tmp = (WORD8 *)p_inp; + + WORD32 t = xa_nn_transpose_8_8(pin + ,p_out_shape + ,p_tmp + ,p_inp_shape + ,p_permute_vec + ,num_out_dims + ,num_inp_dims); + + WORD32 p_inp_shape1[4]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[4]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + WORD32 p_permute_vec1[4] = {0, 2, 3, 1}; + + WORD32 num_out_dims1 = 4; + WORD32 num_inp_dims1 = 4; + + WORD8 * p_tmp1 = (WORD8 *)p_kernel; + + WORD32 t1 = xa_nn_transpose_8_8(pkernel + ,p_out_shape1 + ,p_tmp1 + ,p_inp_shape1 + ,p_permute_vec1 + ,num_out_dims1 + ,num_inp_dims1); + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size=scratch_size<0?0:(scratch_size); + + ptr_scratch = (WORD32 *)malloc(scratch_size); + + p_scratch = (pVOID )ALIGN_PTR(ptr_scratch, 8); + + const UWORD8* __restrict__ p_inp1 = (const UWORD8* __restrict__ )pin; + const UWORD8* __restrict__ p_kernel1 = (const UWORD8* __restrict__ )pkernel; + + for (int _n = 0; _n < batches; _n++) { + const UWORD8* __restrict__ in_batch = p_inp1 + _n * input_channels * input_height * input_width; + UWORD8* __restrict__ out_batch = p_out + _n * out_channels * out_height * out_width; + + xa_nn_conv2d_per_chan_asym8xasym8 + (out_batch + ,in_batch + ,p_kernel1 + ,p_bias + ,input_height + ,input_width + ,input_channels + ,kernel_height + ,kernel_width + ,kernel_channels + ,dilation_height + ,dilation_width + ,out_channels + ,x_stride + ,y_stride + ,x_padding + ,y_padding + ,out_height + ,out_width + ,input_zero_bias + ,out_multiplier32 + ,out_shift32 + ,out_zero_bias + ,out_data_format + ,p_scratch + ); + } + + free(ptr1); + free(ptr2); + free(ptr_scratch); + } + else + { + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? out.size(2) : out.size(3); + + // Bool flag to check if weight tensor is quantized per-tensor or + // per-channel + bool per_tensor_quantized = bias_scale.numel() == 1; + + if(input.scalar_type() == ScalarType::Char) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (int8_t)output_zero_point, + per_tensor_quantized); + + } + else if(input.scalar_type() == ScalarType::Byte) + { + conv2d_nchw_core_generic( + input.const_data_ptr(), + weight.const_data_ptr(), + bias.const_data_ptr(), + out.mutable_data_ptr(), + n, + c, + h, + w, + oc, + wc, + wh, + ww, + oh, + ow, + stride[0], + stride[1], + padding[0], + padding[1], + 1,//dilation[0], + 1,//dilation[1], + groups, + in_zero_point, + weight_zero_point.const_data_ptr(), + bias_scale.const_data_ptr(), + output_scale, + (uint8_t)output_zero_point, + per_tensor_quantized); + } + else + { + ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type()); + } + } + +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c new file mode 100644 index 00000000000..111a90d8e10 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c @@ -0,0 +1,469 @@ +#include "xa_nnlib_common.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nn_conv2d_std_state.h" + +static WORD32 conv_x_left_pad( + WORD32 x_padding, + WORD32 kernel_width, + WORD32 x_stride, + WORD32 out_width, + WORD32 out_height, + WORD32 out_channels, + WORD32 out_channels_offset, + WORD32 out_width_offset, + WORD32 out_height_offset, + const WORD32* __restrict__ p_bias, + WORD8 *p_out, + WORD32 * p_out_multiplier, + WORD32 * p_out_shift, + WORD32 out_zero_bias) +{ + WORD32 i,j,k; + WORD32 out_width_over_x_pad = (x_padding - kernel_width)/x_stride + 1; + WORD32 left_shift, right_shift; + out_width_over_x_pad = out_width_over_x_pad > out_width ? out_width : out_width_over_x_pad; + + ae_int32x2 max_int8 = AE_MOVDA32(255); + ae_int32x2 min_int8 = AE_MOVDA32(0); + + /* When kernel convolves over x-left pad region only, output is just bias */ + for(i = 0; i < out_height; i++) + { + for(j = 0; j < out_width_over_x_pad; j++) + { + for(k = 0; k < out_channels; k++) + { +#if TFLITE_SINGLE_ROUNDING + left_shift = p_out_shift[k]; + /* Single rounding macro doesn't need two shifts so this is not used */ + (void)right_shift; +#else /* #if TFLITE_SINGLE_ROUNDING */ + left_shift = p_out_shift[k] < 0 ? 0 : p_out_shift[k]; + right_shift = p_out_shift[k] > 0 ? 0 : -p_out_shift[k]; +#endif /* #if TFLITE_SINGLE_ROUNDING */ + ae_int32x2 acc; +#if XCHAL_HAVE_HIFI1 + if(p_bias != NULL){ + acc = AE_L32_I((ae_int32*)&p_bias[k], 0); + } + else{ + acc = AE_MOVDA32(0); + } + MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift); + acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias)); + acc = AE_MAX32(acc, min_int8); + acc = AE_MIN32(acc, max_int8); + AE_S8_0_X_HIFI1( AE_MOVINT16X4_FROMINT32X2(acc), (WORD8 *)p_out, (i * out_height_offset + j * out_width_offset + k * out_channels_offset)); +#else + if(p_bias != NULL){ + acc = AE_MOVDA32(p_bias[k]); + } + else{ + acc = AE_MOVDA32(0); + } + MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift); + acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias)); +#if 0 + AE_MINMAX32(acc, min_int8, max_int8); +#else + acc = AE_MAX32(acc, min_int8); + acc = AE_MIN32(acc, max_int8); +#endif + p_out[i * out_height_offset + j * out_width_offset + k * out_channels_offset] = (UWORD8)AE_MOVAD32_L(acc); +#endif + } + } + } + return out_width_over_x_pad; +} + +static WORD32 conv_x_right_pad( + WORD32 x_padding, + WORD32 input_width, + WORD32 x_stride, + WORD32 out_width, + WORD32 out_height, + WORD32 out_channels, + WORD32 out_channels_offset, + WORD32 out_width_offset, + WORD32 out_height_offset, + const WORD32* __restrict__ p_bias, + WORD8 *p_out, + WORD32 * p_out_multiplier, + WORD32 * p_out_shift, + WORD32 out_zero_bias) +{ + WORD32 i,j,k; + WORD32 idx_out_width_over_x_r_pad = (x_padding + input_width + x_stride - 1)/x_stride + 1; + WORD32 left_shift, right_shift; + WORD32 out_width_over_x_r_pad = out_width - idx_out_width_over_x_r_pad; + + ae_int32x2 max_int8 = AE_MOVDA32(255); + ae_int32x2 min_int8 = AE_MOVDA32(0); + + /* When kernel convolves over x-right pad region only, output is just bias */ + for(i = 0; i < out_height; i++) + { + for(j = idx_out_width_over_x_r_pad; j < out_width; j++) + { + for(k = 0; k < out_channels; k++) + { +#if TFLITE_SINGLE_ROUNDING + left_shift = p_out_shift[k]; + /* Single rounding macro doesn't need two shifts so this is not used */ + (void)right_shift; +#else /* #if TFLITE_SINGLE_ROUNDING */ + left_shift = p_out_shift[k] < 0 ? 0 : p_out_shift[k]; + right_shift = p_out_shift[k] > 0 ? 0 : -p_out_shift[k]; +#endif /* #if TFLITE_SINGLE_ROUNDING */ + ae_int32x2 acc; +#if XCHAL_HAVE_HIFI1 + if(p_bias != NULL){ + acc = AE_L32_I((ae_int32*)&p_bias[k], 0); + } + else{ + acc = AE_MOVDA32(0); + } + MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift); + acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias)); + acc = AE_MAX32(acc, min_int8); + acc = AE_MIN32(acc, max_int8); + AE_S8_0_X_HIFI1( AE_MOVINT16X4_FROMINT32X2(acc), (WORD8 *)p_out, (i * out_height_offset + j * out_width_offset + k * out_channels_offset)); +#else + if(p_bias != NULL){ + acc = AE_MOVDA32(p_bias[k]); + } + else{ + acc = AE_MOVDA32(0); + } + MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift); + acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias)); +#if 0 + AE_MINMAX32(acc, min_int8, max_int8); +#else + acc = AE_MAX32(acc, min_int8); + acc = AE_MIN32(acc, max_int8); +#endif + p_out[i * out_height_offset + j * out_width_offset + k * out_channels_offset] = (UWORD8)AE_MOVAD32_L(acc); +#endif + } + } + } + return out_width_over_x_r_pad; +} + +#ifdef polyphase_debug +#include +void writingoutput(WORD8* __restrict__ p_out_base, WORD32 out_height, WORD32 out_width,WORD32 out_channels ) +{ + int i,j, count; + FILE * dataFilePr; + count = 0; + dataFilePr = fopen("C:/Users/hariev/Documents/file.txt", "w+"); + for(i=0;i 0), -1); + XA_NNLIB_ARG_CHK_COND((out_zero_bias < 0 || out_zero_bias > 255), -1); + XA_NNLIB_ARG_CHK_COND((out_data_format != 0 && out_data_format != 1), -1); + XA_NNLIB_ARG_CHK_COND((dilation_height != 1), -1); + XA_NNLIB_ARG_CHK_COND((dilation_width != 1), -1); + + int itr; + for(itr=0;itr 31), -1); + } + + const int groups = input_channels/kernel_channels; + XA_NNLIB_ARG_CHK_COND((groups<=0), -1); + XA_NNLIB_ARG_CHK_COND(((input_channels %kernel_channels)!=0),-1); + XA_NNLIB_ARG_CHK_COND(((out_channels%groups)!=0),-1); + const int kernels_per_group = out_channels / groups; + XA_NNLIB_ARG_CHK_COND((kernels_per_group<=0),-1); + + int ret = 0; + + WORD32 j; + WORD32 input_bytewidth = 1; + VOID *pp_inp = (VOID *)p_inp; + UWORD8* __restrict__ tmp_out; + + p_scratch = ALIGNED_ADDR(p_scratch, ALIGNMENT); + xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_scratch; + WORD32 inp_h, inp_w, ker_h, ker_w, x_str, y_str, x_pad, y_pad, out_h, out_w; + + if ((input_height == 1) && (kernel_height == 1) && (out_height == 1)) + { + inp_h = input_width; + inp_w = input_height; + ker_h = kernel_width; + ker_w = kernel_height; + x_str = y_stride; + y_str = x_stride; + x_pad = y_padding; + y_pad = x_padding; + out_h = out_width; + out_w = out_height; + } + else + { + inp_h = input_height; + inp_w = input_width; + ker_h = kernel_height; + ker_w = kernel_width; + x_str = x_stride; + y_str = y_stride; + x_pad = x_padding; + y_pad = y_padding; + out_h = out_height; + out_w = out_width; + } + + WORD32 out_channels_offset = out_data_format ? out_h * out_w : 1; + WORD32 out_height_offset = out_data_format ? out_w : out_w * out_channels; + WORD32 out_width_offset = out_data_format ? 1 : out_channels; + + WORD32 x_padding_var = x_pad; + WORD32 kernel_channels_pad; + + kernel_channels_pad = PADDED_SIZE(kernel_channels, (ALIGNMENT >> 1)); + + /* When kernel convolves over x-left pad region only */ + WORD32 out_width_over_x_pad = 0; + + if(x_padding_var >= ker_w) + { + out_width_over_x_pad = conv_x_left_pad(x_pad, ker_w, x_str, out_w, out_h, out_channels, out_channels_offset, out_width_offset, out_height_offset, p_bias, p_out, p_out_multiplier, p_out_shift, out_zero_bias); + x_padding_var -= out_width_over_x_pad * x_str; + } + + /* When kernel convolves over x-right pad region only */ + WORD32 out_width_over_x_r_pad = 0; + // Determine x-right padding + WORD32 x_r_pad = ker_w + (out_w - 1) * x_str - (x_pad + inp_w); + x_r_pad = x_r_pad < 0 ? 0 : x_r_pad; + if(x_r_pad >= ker_w) + { + out_width_over_x_r_pad = conv_x_right_pad(x_pad, inp_w, x_str, out_w, out_h, out_channels, out_channels_offset, out_width_offset, out_height_offset, p_bias, p_out, p_out_multiplier, p_out_shift, out_zero_bias); + } + + /* When kernel convolves over input region */ + p_out += out_width_over_x_pad * out_width_offset; + // Initialize circular buffer + // Determine y-bottom padding + WORD32 y_b_pad = ker_h + (out_h - 1) * y_str - (y_pad + inp_h); + y_b_pad = y_b_pad < 0 ? 0 : y_b_pad; + + xa_nn_conv2d_std_init_state((void*)p_state + ,(void*)p_kernel + ,inp_h + ,input_channels + ,ker_h + ,kernel_width + ,y_str + ,y_pad + ,out_h + ,out_channels + ,PREC_ASYM8U + ,PREC_ASYM8U); + + for (int grp_i = 0; grp_i < groups; ++grp_i) + { + tmp_out=p_out+grp_i*kernels_per_group*out_channels_offset; + xa_nn_conv2d_group_init_state((void*)p_state + ,(void*)p_kernel + ,inp_h + ,kernel_channels + ,ker_h + ,ker_w + ,y_str + ,y_pad + ,out_h + ,out_channels + ,PREC_ASYM8U + ,PREC_ASYM8U); + + pp_inp = (VOID *)(p_inp+grp_i*kernel_channels); + + conv2d_group_init_cir_buf(input_channels, kernel_channels_pad,kernel_channels,input_bytewidth, inp_w, inp_h, y_pad, y_b_pad, x_padding_var, ker_w, x_str, (VOID**)&pp_inp, p_state, -input_zero_bias); + + // Index to padded input width + WORD32 idx_beg_inp_width_pad = ker_w - x_str; + idx_beg_inp_width_pad = idx_beg_inp_width_pad < 0 ? 0 : idx_beg_inp_width_pad; + + // Process Loop to compute one output plane [out_h x out_channels] per iteration + for(j=0; j < out_w-out_width_over_x_pad-out_width_over_x_r_pad; j++) + { + // Add x_str x (inp_h x input_channels) new planes to circular buffer + conv2d_group_update_cir_buf(input_channels, kernel_channels_pad,kernel_channels,input_bytewidth, inp_w, inp_h, y_pad, y_b_pad, x_padding_var, ker_w, x_str, (VOID**)&pp_inp, idx_beg_inp_width_pad, p_state, -input_zero_bias); + + // Update index to input width padded + idx_beg_inp_width_pad += x_str; + + const WORD32 *p_bias_grp = NULL; + if(p_bias != NULL){ + p_bias_grp = p_bias+grp_i*kernels_per_group; + } + + xa_nn_matXvec_asym8xasym8_asym8_circ + (tmp_out /* output */ + ,p_state->cir_buf.p_curr/* matrix: rows x cols */ + ,(p_state->p_kernel_padded+grp_i*kernels_per_group*kernel_channels_pad*ker_w*ker_h) /* vec: cols */ + ,p_bias_grp/* bias */ + ,out_h /* rows */ + ,kernel_channels_pad * ker_w * ker_h /* cols */ + ,kernel_channels_pad * ker_w * y_str/* row_offset */ + ,kernels_per_group /* vec_count */ + ,kernel_channels_pad * ker_w * ker_h /* vec_stride */ + ,out_channels_offset /* out_col_offset */ + ,out_height_offset /* out_row_offset */ + ,input_zero_bias + ,0 + ,p_out_multiplier[0] + ,p_out_shift[0] + ,out_zero_bias + ); + + tmp_out += out_width_offset; + } + } + + return 0; +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c new file mode 100644 index 00000000000..f6820f21cc3 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c @@ -0,0 +1,1580 @@ +#include "xa_nnlib_common.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nn_conv2d_std_state.h" + +#ifndef ENABLE_SCRATCH_SIZE_API_ONLY +VOID xa_nn_conv2d_std_init_state( + VOID *p_scratch, + VOID *p_kernel, + WORD32 input_height, + WORD32 input_channels, + WORD32 kernel_height, + WORD32 kernel_width, + WORD32 y_stride, + WORD32 y_padding, + WORD32 out_height, + WORD32 output_channels, + WORD32 input_precision, + WORD32 kernel_precision) +{ + WORD8 *p_mem = (WORD8 *)p_scratch; + xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_mem; + size_t input_size; + UWORD32 align_size; + + switch(input_precision) + { + case 8: + input_size = sizeof(WORD8); + align_size = ALIGNMENT>>1; + break; + case -8: + case 16: + input_size = sizeof(WORD16); + align_size = ALIGNMENT>>1; + break; + case -1: + input_size = sizeof(WORD32); + align_size = ALIGNMENT>>2; + break; + case -3: + input_size = sizeof(UWORD8); + align_size = ALIGNMENT>>1; + break; + case -4: + case -5: + input_size = sizeof(WORD8); + align_size = ALIGNMENT>>1; + break; + default: + input_size = 0; + align_size = 0; + break; + } + p_mem += sizeof(xa_nn_conv_state_t); + p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT); + + if(((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_mem & BUS_WIDTH_MASK)) + { + p_mem += BUS_WIDTH; /* Add a offset to avoid banking stall */ + } + + p_state->cir_buf.p_begin = p_mem; + p_state->cir_buf.p_curr = p_mem; + + // Computing circular buffer size + // Determine y-bottom padding + WORD32 y_b_pad = kernel_height + (out_height - 1) * y_stride - (y_padding + input_height); + y_b_pad = y_b_pad < 0 ? 0 : y_b_pad; + + WORD32 input_channels_pad; + +#if !ENABLE_PADDING_CONV2D_STD + if(input_precision == PREC_ASYM8S) + { + input_channels_pad = input_channels; + } + else +#endif + { +#if HW_AE_ADDCIRC16X4_XC + /* Disbale padding for ic=1 (worst case scenario for performance), if hardware support exists. + * Enabled only for conv2d_std_sym8sxasym8s variant */ + if(input_channels == 1 && kernel_precision == PREC_SYM8S && input_precision == PREC_ASYM8S) + { + input_channels_pad = 1; + } + else +#endif + { + input_channels_pad = PADDED_SIZE(input_channels, align_size); + } + } + + WORD32 cir_buf_size_bytes = (y_padding + input_height + y_b_pad) * kernel_width * input_channels_pad * input_size; + while(cir_buf_size_bytes%16 !=0) + { + cir_buf_size_bytes+= kernel_width*input_channels_pad*input_size; + } + + p_mem += cir_buf_size_bytes; + p_state->cir_buf.p_end = p_mem; + + AE_SETCBEGIN0(p_state->cir_buf.p_begin); + AE_SETCEND0(p_state->cir_buf.p_end); + + p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT); + + p_state->p_kernel_padded = (void *)p_kernel; + +#if !ENABLE_PADDING_CONV2D_STD + if( + (input_precision != PREC_ASYM8S) && + (input_precision != PREC_F32) && + (input_precision != PREC_16) && + (input_channels_pad != input_channels) + ) +#else + if( + (input_precision != PREC_16) && + (input_channels_pad != input_channels) + ) +#endif + { + int oc, kh, kw, kernel_size; + p_state->p_kernel_padded = (void *)p_mem; + + switch(kernel_precision) + { + case 8: + kernel_size = sizeof(WORD8); + break; + case 16: + kernel_size = sizeof(WORD16); + break; + case -1: + kernel_size = sizeof(WORD32); + break; + case -3: + kernel_size = sizeof(UWORD8); + break; + case -4: + case -5: + kernel_size = sizeof(WORD8); + break; + default: + kernel_size = 0; + break; + } + + memset(p_mem, 0, output_channels*kernel_height*kernel_width*input_channels_pad*kernel_size); + pWORD8 p_src = (pWORD8) p_kernel; + pWORD8 p_dst = (pWORD8) p_state->p_kernel_padded; + + for(oc = 0; oc < output_channels; oc++) + for(kh = 0; kh < kernel_height; kh++) + { + for(kw = 0; kw < kernel_width; kw++) + { +// memcpy(p_dst, p_src, kernel_size * input_channels); + for(int ii=0; ii>1; + break; + case -8: + case 16: + input_size = sizeof(WORD16); + align_size = ALIGNMENT>>1; + break; + case -1: + input_size = sizeof(WORD32); + align_size = ALIGNMENT>>2; + break; + case -3: + input_size = sizeof(UWORD8); + align_size = ALIGNMENT>>1; + break; + case -4: + case -5: + input_size = sizeof(WORD8); + align_size = ALIGNMENT>>1; + break; + default: + input_size = 0; + align_size = 0; + break; + } + p_mem += sizeof(xa_nn_conv_state_t); + p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT); + + if(((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_mem & BUS_WIDTH_MASK)) + { + p_mem += BUS_WIDTH; /* Add a offset to avoid banking stall */ + } + + p_state->cir_buf.p_begin = p_mem; + p_state->cir_buf.p_curr = p_mem; + + // Computing circular buffer size + // Determine y-bottom padding + WORD32 y_b_pad = kernel_height + (out_height - 1) * y_stride - (y_padding + input_height); + y_b_pad = y_b_pad < 0 ? 0 : y_b_pad; + + WORD32 kernel_channels_pad; + +#if !ENABLE_PADDING_CONV2D_STD + if(input_precision == PREC_ASYM8S) + { + kernel_channels_pad = kernel_channels; + } + else +#endif + { +#if HW_AE_ADDCIRC16X4_XC + /* Disbale padding for ic=1 (worst case scenario for performance), if hardware support exists. + * Enabled only for conv2d_std_sym8sxasym8s variant */ + if(kernel_channels == 1 && kernel_precision == PREC_SYM8S && input_precision == PREC_ASYM8S) + { + kernel_channels_pad = 1; + } + else +#endif + { + kernel_channels_pad = PADDED_SIZE(kernel_channels, align_size); + } + } + + WORD32 cir_buf_size_bytes = (y_padding + input_height + y_b_pad) * kernel_width * kernel_channels_pad * input_size; + while(cir_buf_size_bytes%16 !=0) + { + cir_buf_size_bytes+= kernel_width*kernel_channels_pad*input_size; + } + + p_mem += cir_buf_size_bytes; + p_state->cir_buf.p_end = p_mem; + + AE_SETCBEGIN0(p_state->cir_buf.p_begin); + AE_SETCEND0(p_state->cir_buf.p_end); + +} + +VOID xa_nn_conv2d_dilation_init_state( + VOID *p_scratch, + VOID *p_kernel, + VOID *p_input) +{ + WORD8 *p_mem = (WORD8 *)p_scratch; + xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_mem; + + p_mem += sizeof(xa_nn_conv_state_t); + p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT); + + + if(((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_mem & BUS_WIDTH_MASK)) + { + p_mem += BUS_WIDTH; /* Add a offset to avoid banking stall */ + } + p_state->cir_buf.p_base = p_mem; + p_state->p_inp_base = p_input; +} + + +VOID xa_nn_dilated_conv2d_std_init_circ_buf( + VOID *p_scratch, + VOID *p_kernel, + WORD32 input_height, + WORD32 input_channels, + WORD32 kernel_height_dilation, + WORD32 kernel_width, + WORD32 y_stride, + WORD32 y_padding, + WORD32 out_height, + WORD32 output_channels, + WORD32 dilation_height, + WORD32 dilation_h_offset, + WORD32 input_precision, + WORD32 kernel_precision) +{ + WORD8 *p_mem;// = (WORD8 *)p_scratch; + xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_scratch; + size_t input_size = 0; + UWORD32 align_size = 0; + WORD32 input_channels_pad; + + switch(input_precision) + { + case 8: + case -4: + input_size = sizeof(WORD8); + align_size = ALIGNMENT>>1; + break; + case 16: + input_size = sizeof(WORD16); + align_size = ALIGNMENT>>1; + break; + case -1: + input_size = sizeof(WORD32); + align_size = ALIGNMENT>>2; + break; + case -3: + input_size = sizeof(UWORD8); + align_size = ALIGNMENT>>1; + break; + default: + break; + } + + p_state->cir_buf.p_begin = p_state->cir_buf.p_base; + p_state->cir_buf.p_curr = p_state->cir_buf.p_begin; + + p_mem = p_state->cir_buf.p_begin; + + // Computing circular buffer size + // Determine y-bottom padding +#if !ENABLE_PADDING_CONV2D_STD + if(input_precision == PREC_8 || input_precision == PREC_ASYM8U || input_precision == PREC_ASYM8S) //TODO: remove the condition when the padding requirement is removed for other variants. + input_channels_pad = input_channels; + else +#endif + input_channels_pad = PADDED_SIZE(input_channels, align_size); + + // calculate height for this offset case + WORD32 y_b_pad_total = kernel_height_dilation + (out_height - 1) * y_stride - (y_padding + input_height); + y_b_pad_total = y_b_pad_total < 0 ? 0 : y_b_pad_total; + + WORD32 total_height = (y_padding + input_height + y_b_pad_total); + WORD32 height = (total_height/dilation_height) + (WORD32) (((total_height%dilation_height)-1)>=dilation_h_offset); + + WORD32 cir_buf_size_bytes = height * kernel_width * input_channels_pad * input_size; + + while(cir_buf_size_bytes%16 !=0) + { + cir_buf_size_bytes+= kernel_width*input_channels_pad*input_size; + } + + p_mem += cir_buf_size_bytes; + p_state->cir_buf.p_end = p_mem; + + AE_SETCBEGIN0(p_state->cir_buf.p_begin); + AE_SETCEND0(p_state->cir_buf.p_end); + + p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT); + + p_state->p_kernel_padded = (void *)p_kernel; + +#if !ENABLE_PADDING_CONV2D_STD + if( (input_precision != PREC_ASYM8S) && + (input_precision != PREC_F32) && + (input_precision != PREC_16) && + (input_channels_pad != input_channels) ) +#else + if( (input_precision != PREC_F32) && + (input_precision != PREC_16) && + (input_channels_pad != input_channels) ) +#endif + { + int oc, kh, kw, kernel_size; + p_state->p_kernel_padded = (void *)p_mem; + + switch(kernel_precision) + { + case 8: + kernel_size = sizeof(WORD8); + break; + case 16: + kernel_size = sizeof(WORD16); + break; + case -1: + kernel_size = sizeof(WORD32); + break; + case -3: + kernel_size = sizeof(UWORD8); + break; + case -4: + case -5: + kernel_size = sizeof(WORD8); + break; + default: + kernel_size = 0; + break; + } + + pWORD8 p_src = (pWORD8) p_kernel; + pWORD8 p_dst = (pWORD8) p_state->p_kernel_padded; + + for(oc = 0; oc < output_channels; oc++) + { + for(kh = 0; kh < kernel_height_dilation; kh++) + { + for(kw = 0; kw < kernel_width; kw++) + { + memcpy(p_dst, p_src, kernel_size * input_channels); + p_dst += kernel_size * input_channels; + p_src += kernel_size * input_channels; + + memset(p_dst, 0, kernel_size * (input_channels_pad - input_channels)); + p_dst += kernel_size * (input_channels_pad - input_channels); + } + } + } + } +} + +VOID conv2d_std_init_cir_buf( + WORD32 input_channels, + WORD32 input_channels_pad, + WORD32 input_bytewidth, + WORD32 input_width, + WORD32 input_height, + WORD32 y_padding, + WORD32 y_b_pad, + WORD32 x_padding, + WORD32 kernel_width, + WORD32 x_stride, + VOID **pp_inp, + xa_nn_conv_state_t *p_state) +{ + WORD32 i,k; + WORD8 *p_inp = (WORD8 *)*pp_inp; + WORD32 planes_to_add = x_stride > kernel_width ? 0 : kernel_width - x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); + + // Initialize circular buffer + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width ? 0 : kernel_width - x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + UWORD8 pad_val_u8 = (UWORD8)pad_val; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); + + // Initialize circular buffer + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width ? kernel_width : x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + + // Copy 'planes_to_add' planes of data to circular buffer + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad * input_bytewidth); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); + + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width + WORD32 copy_x_r_pad_width = 0; + if(idx_beg_inp_width_pad < x_padding) + { + copy_x_pad_width = x_padding - idx_beg_inp_width_pad; + copy_inp_width = idx_end_inp_width_pad - x_padding; + copy_x_r_pad_width = XT_MAX(0, copy_inp_width - input_width); + copy_inp_width = XT_MIN(copy_inp_width, input_width); + } + else if(idx_end_inp_width_pad <= x_padding + input_width) + { + copy_inp_width = planes_to_add; + } + else if(idx_beg_inp_width_pad < x_padding + input_width) + { + copy_inp_width = x_padding + input_width - idx_beg_inp_width_pad; + copy_x_r_pad_width = idx_end_inp_width_pad - (x_padding + input_width); + } + else + { + copy_x_r_pad_width = planes_to_add; + } + + const int size1 = input_channels * input_bytewidth; + const int size2 = (input_channels_pad - input_channels) * input_bytewidth; + const int size3 = input_channels_pad * input_bytewidth; + if( (size1 < 32) && (size2 < 32) && (size3 < 32)){ + /* This case handle smaller sizes (<32) in which the functions memset/memcpy are not to be called */ + for(i=0;i kernel_width ? kernel_width : x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + + // Copy 'planes_to_add' planes of data to circular buffer + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * kernel_channels_pad * input_bytewidth); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); + + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width + WORD32 copy_x_r_pad_width = 0; + if(idx_beg_inp_width_pad < x_padding) + { + copy_x_pad_width = x_padding - idx_beg_inp_width_pad; + copy_inp_width = idx_end_inp_width_pad - x_padding; + copy_x_r_pad_width = XT_MAX(0, copy_inp_width - input_width); + copy_inp_width = XT_MIN(copy_inp_width, input_width); + } + else if(idx_end_inp_width_pad <= x_padding + input_width) + { + copy_inp_width = planes_to_add; + } + else if(idx_beg_inp_width_pad < x_padding + input_width) + { + copy_inp_width = x_padding + input_width - idx_beg_inp_width_pad; + copy_x_r_pad_width = idx_end_inp_width_pad - (x_padding + input_width); + } + else + { + copy_x_r_pad_width = planes_to_add; + } + + const int size1 = kernel_channels * input_bytewidth; + const int size2 = (kernel_channels_pad - kernel_channels) * input_bytewidth; + const int size3 = kernel_channels_pad * input_bytewidth; + + if ((kernel_channels <= 16) && ((kernel_channels_pad-kernel_channels) <= 16) && (kernel_channels_pad <= 16)) + { + for(i=0;i kernel_width ? kernel_width : x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + + // Copy 'planes_to_add' planes of data to circular buffer + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad * input_bytewidth); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); + + // Set next 'input_height' rows of cir_buf with zero (from x_padding) and/or input data and/or zero (from x-right padding) + //WORD32 idx_end_inp_width_pad = idx_beg_inp_width_pad + planes_to_add; + WORD32 copy_inp_width = planes_to_add; + WORD32 to_skip_inp_width = x_stride - planes_to_add; // Non-zero for x_stride > kernel_width + + int size = input_channels * input_bytewidth; + if (size <= 32) { + for(i=0;i kernel_width ? kernel_width : x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + + // Copy 'planes_to_add' planes of data to circular buffer + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * kernel_channels_pad * input_bytewidth); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); + + // Set next 'input_height' rows of cir_buf with zero (from x_padding) and/or input data and/or zero (from x-right padding) + //WORD32 idx_end_inp_width_pad = idx_beg_inp_width_pad + planes_to_add; + WORD32 copy_inp_width = planes_to_add; + WORD32 to_skip_inp_width = x_stride - planes_to_add; // Non-zero for x_stride > kernel_width + + int size = kernel_channels * input_bytewidth; + const int size2 = (kernel_channels_pad - kernel_channels) * input_bytewidth; + if ((kernel_channels <= 16) && ((kernel_channels_pad-kernel_channels) <= 16)) { + for(i=0;i kernel_width ? 0 : kernel_width - x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + //ae_int8x8 zero_pad = AE_MOVDA8(pad_val); + UWORD8 pad_val_u8 = (UWORD8)pad_val; + //ae_int8x8 inp_val; + (void) input_bytewidth; + WORD32 y_padding_dilation; + + if(!firstCall) + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad); + + WORD32 indexCorrectionDoneInHeight = 1; + WORD32 heightIndexIterationModified = heightIndexIteration; + y_padding_dilation = (y_padding / dilation_height) + (WORD32)(((y_padding%dilation_height)-1)>=dilation_h_offset); + WORD32 y_padding_dilation_indexCorrected = y_padding_dilation - heightIndexIteration; + if(y_padding_dilation_indexCorrected<0) + { + indexCorrectionDoneInHeight = 0; + heightIndexIterationModified = -y_padding_dilation_indexCorrected; + y_padding_dilation_indexCorrected = 0; + } + *circMatrixHeight = 0; + *circMatrixHeight = *circMatrixHeight + y_padding_dilation_indexCorrected; + // Initialize circular buffer + /*if(input_channels == 1) + { + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i= dilation_w_offset);//This is the contribution of zero padding(in total) towards this width offset + WORD32 x_padding_dilation_initial_pad = ((x_padding_full-x_padding)/dilation_width) + (WORD32) ( (((x_padding_full-x_padding)%dilation_width)-1) >= dilation_w_offset); /// This offset's contribution which has been absorbed in initial analysis of zero padding + WORD32 x_padding_dilation = x_padding_full_dilation - x_padding_dilation_initial_pad;//This is the num of zeros contribution from left padding for this dilation offset + WORD32 indexCorrectionDoneInWidth = 1; + WORD32 widthIndexIterationModified = widthIndexIteration; + //Accounting for initial width index/point in this sub-matrix for this offset (This arises from stride implementation) + WORD32 x_padding_dilation_postIndexCorrection = x_padding_dilation;// - widthIndexIteration;/// If this value lr. than zero implies first width-index inside this sub-matrix is inside input matrix after crossing left zero padding + + x_padding_dilation_postIndexCorrection = x_padding_dilation_postIndexCorrection - widthIndexIteration; + if(x_padding_dilation_postIndexCorrection<0) + { + indexCorrectionDoneInWidth = 0; + widthIndexIterationModified = -x_padding_dilation_postIndexCorrection; + x_padding_dilation_postIndexCorrection = 0; + } + else + { + indexCorrectionDoneInWidth = 1; + widthIndexIterationModified = 0; + } + + x_padding_dilation = x_padding_dilation_postIndexCorrection - (*input_padding_consumed); /// When this loop called repeatedly; some of the input will be consumed discounting for that + + if(x_padding_dilation<0) + x_padding_dilation = 0;/// This condition can occur when we are done with zero padding section in the prev. iteration(can be first iteration in corner case) + + + /// Calculate number of input width/columns remaining for this width offset which can participate in the convolution process + WORD32 x_padding_plus_input_dilation = ( (x_padding_full+input_width)/dilation_width) + (WORD32) ( (((x_padding_full+input_width)%dilation_width)-1) >= dilation_w_offset);//This is the num elements to be convolved for this offset in total(zeropad+input) + WORD32 x_input_dilation = x_padding_plus_input_dilation - x_padding_full_dilation;// This is the number of elements from input that can potentially be populated + WORD32 x_input_dilation_postIndexCorrection; + WORD32 input_width_correction; + + if(indexCorrectionDoneInWidth==0) + { + x_input_dilation_postIndexCorrection = x_input_dilation - widthIndexIterationModified; // this value if -ve correction flows towards right z.p + if(x_input_dilation_postIndexCorrection<0) + { + indexCorrectionDoneInWidth = 0; + widthIndexIterationModified = -x_input_dilation_postIndexCorrection; + x_input_dilation_postIndexCorrection = 0; + input_width_correction = x_input_dilation; + } + else + { + indexCorrectionDoneInWidth = 1; + input_width_correction = widthIndexIterationModified; + widthIndexIterationModified = 0; + + } + } + else + { + x_input_dilation_postIndexCorrection = x_input_dilation; + input_width_correction = 0; + } + + WORD32 x_input_dilation_postIndexCorrection_total = x_input_dilation_postIndexCorrection;/// This is the total convoble area after adjustng for stride offset for this dilation_offset + x_input_dilation_postIndexCorrection = x_input_dilation_postIndexCorrection - (*input_width_consumed);//consumedInput;/// When this loop called repeatedly; some of the input will be consumed discounting for that + if(x_input_dilation_postIndexCorrection<0) + x_input_dilation_postIndexCorrection = 0;/// This implies the control is to right padding + + WORD32 copy_x_pad_width, copy_x_r_pad_width, copy_inp_width; + + if(planes_to_add <= x_padding_dilation) + { + copy_x_pad_width = planes_to_add; + copy_inp_width = 0; + copy_x_r_pad_width = 0; + } + else if(planes_to_add <= (x_padding_dilation+x_input_dilation_postIndexCorrection) ) + { + copy_x_pad_width = x_padding_dilation; + copy_inp_width = planes_to_add - copy_x_pad_width; + copy_x_r_pad_width = 0; + } + else + { + copy_x_pad_width = x_padding_dilation; + copy_inp_width = x_input_dilation_postIndexCorrection; + copy_x_r_pad_width = planes_to_add - (copy_x_pad_width+copy_inp_width) ;/// No need to calculate the right padding exactly as the loop outside i.e, calling function takes care of it + } + + { + // estimate total number of height values for height_offset value from the input matrix + WORD32 input_padding_plus_height_dilation = ( (y_padding+input_height) / dilation_height) + (WORD32)((((y_padding+input_height)%dilation_height)-1)>=dilation_h_offset); + WORD32 input_height_dilation = input_padding_plus_height_dilation - y_padding_dilation;//y_padding_dilation; /// This value is the height of the circular matrix that has to be iterated for non-zero input values i.e., without top padding and bottim padding iterations + WORD32 input_height_dilation_indexCorrected = input_height_dilation; + WORD32 input_height_correction = 0; + if(indexCorrectionDoneInHeight==0) + { + input_height_dilation_indexCorrected = input_height_dilation_indexCorrected - heightIndexIterationModified; + if(input_height_dilation_indexCorrected<0) + { + indexCorrectionDoneInHeight = 0; + heightIndexIterationModified = -input_height_dilation_indexCorrected; + input_height_dilation_indexCorrected = 0; + input_height_correction = input_height_dilation; + } + else + { + indexCorrectionDoneInHeight = 1; + input_height_correction = heightIndexIterationModified; + heightIndexIterationModified = 0; + + } + } + *circMatrixHeight = *circMatrixHeight + input_height_dilation_indexCorrected; + + /// estimate the offset needed in the input matrix for this height offset + WORD32 index_0_input_dilation_height_offset = (y_padding % dilation_height) ; ///This value represent 0th index in input matrix (post top padding) correspond to which offset in height's dilation scale + WORD32 input_offset_height_dilation = (dilation_h_offset - index_0_input_dilation_height_offset + dilation_height)%dilation_height;// "index_0_input_dilation_height_offset" represent the dilation offset corresponding to 0 th row of input but, the target is to reach "dilation_h_offset" in dilation scale. This calculation helps reach there from "index_0_input_dilation_height_offset" + + p_inp = p_inp + (input_offset_height_dilation * input_width * input_channels); // This offsets the pointer as per the dilation offset in height dimension for stride=1. While supporting stride find the point inside sub matrix that is the starting point + p_inp = p_inp + (input_height_correction * dilation_height * input_width * input_channels);///This accounts for offset i.e., initial index that arises out of stride support + /// In the above calculation of pointer ystride is not brought into calculation, in height dimension Ystride will be handled by core convolution code + + //for(i=0;i0) && (x_stride_dilated>kernel_width) ) + *input_width_consumed = *input_width_consumed + x_stride_dilated - copy_x_pad_width;/// Account for stride consumption only if there was any consumption. Reduce whatever was consumed in left zp + else + *input_width_consumed = *input_width_consumed + copy_inp_width; + + if(x_input_dilation_postIndexCorrection_total < (*input_width_consumed) ) + *input_width_consumed = x_input_dilation_postIndexCorrection_total; + + + if ( (copy_x_pad_width >0) && (x_stride_dilated>kernel_width) ) + *input_padding_consumed = *input_padding_consumed + x_stride_dilated ; + else + *input_padding_consumed = *input_padding_consumed + copy_x_pad_width ; + + if(x_padding_dilation_postIndexCorrection < (*input_padding_consumed) ) + *input_padding_consumed = x_padding_dilation_postIndexCorrection; + + /// Similar consumption calculation is not needed for right padding. This is because in right padding number of points will be lesser than kernel width as the outside function would have absolved all other right padding indices implying there would not be more than one call to fill right padding as a part of circular matrix loading + + + WORD32 input_height_toppadding_plus_input_plus_bottom_padding = ((y_padding+input_height+y_b_pad) / dilation_height) + (WORD32)((((y_padding+input_height+y_b_pad)%dilation_height)-1)>=dilation_h_offset);// This is the total number of input points used for convolution for this height offset value + WORD32 y_b_pad_dilation = input_height_toppadding_plus_input_plus_bottom_padding - (y_padding_dilation+input_height_dilation);/// This calculates number of bottom padding points for this dilation offset i.e., dilation_h_offset + + WORD32 input_bpadding_dilation_indexCorrected = y_b_pad_dilation; + + if(indexCorrectionDoneInHeight==0) + { + input_bpadding_dilation_indexCorrected = input_bpadding_dilation_indexCorrected - heightIndexIterationModified; + } + *circMatrixHeight = *circMatrixHeight + input_bpadding_dilation_indexCorrected; + // Set last 'y_b_pad' rows of cir_buf to zero + for(i=0;i kernel_width ? 0 : kernel_width - x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + UWORD8 pad_val_u8 = (UWORD8)pad_val; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); + + // Initialize circular buffer + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width ? kernel_width : x_stride; + WORD32 planes_to_keep = kernel_width - planes_to_add; + + // Copy 'planes_to_add' planes of data to circular buffer + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad * input_bytewidth); + WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr; + AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); + + // Set first 'y_padding' rows of cir_buf to zero + for(i=0;i kernel_width + WORD32 copy_x_r_pad_width = 0; + if(idx_beg_inp_width_pad < x_padding) + { + copy_x_pad_width = x_padding - idx_beg_inp_width_pad; + copy_inp_width = idx_end_inp_width_pad - x_padding; + copy_x_r_pad_width = XT_MAX(0, copy_inp_width - input_width); + copy_inp_width = XT_MIN(copy_inp_width, input_width); + } + else if(idx_end_inp_width_pad <= x_padding + input_width) + { + copy_inp_width = planes_to_add; + } + else if(idx_beg_inp_width_pad < x_padding + input_width) + { + copy_inp_width = x_padding + input_width - idx_beg_inp_width_pad; + copy_x_r_pad_width = idx_end_inp_width_pad - (x_padding + input_width); + } + else + { + copy_x_r_pad_width = planes_to_add; + } + + const int size1 = input_channels * input_bytewidth; + const int size2 = (input_channels_pad - input_channels) * input_bytewidth; + const int size3 = input_channels_pad * input_bytewidth; + if( (size1 < 16) && (size2 < 16) && (size3 < 16)){ + /* This case handle smaller sizes (<16) in which the functions memset/memcpy are not to be called */ + for(i=0;i 31) || (out_shift < -31)) + { + return -1; + } + + if (!p_bias) + { + return -1; + } + +#define UNROLL_ROW_SETUP_ACC_BATCH SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b +#define UNROLL_SETUP_ACC_BATCH SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b +#define UNROLL_SETUP_MAT1 SETUP_MAT1_ASYM8b +#define UNROLL_SETUP_VEC_BATCH SETUP_VEC_BATCH_ASYM8b +#define UNROLL_ROW_SETUP_BIAS_BATCH SETUP_BIAS_BATCH_ROW_ASYM8b +#define UNROLL_SETUP_BIAS_BATCH SETUP_BIAS_BATCH_ASYM8b +#define UNROLL_LOAD_VEC_BATCH LOAD_VEC_BATCH_ASYM8b +#define UNROLL_LOAD_ROW_MAT1 LOAD_ROW_MAT1_ASYM8b +#define UNROLL_ROW_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b +#define UNROLL_KERNEL_MAT1_VEC_BATCH KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b +#define UNROLL_ROW_ADD_BIAS_ACC ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b +#define UNROLL_ADD_BIAS_ACC_BATCH ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b +#define UNROLL_ROW_ADJUST_ACC ADJUST_ACC_BATCH_ROW_ASYM8b +#define UNROLL_ADJUST_ACC_BATCH ADJUST_ACC_BATCH_ASYM8b +#define UNROLL_ROW_STORE_ACC STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b +#define UNROLL_STORE_ACC_BATCH STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b + +#if TFLITE_SINGLE_ROUNDING + left_shift = out_shift; + right_shift = out_shift; + /* Single rounding macro doesn't need two shifts so this is not used */ + (void)right_shift; +#else /* #if TFLITE_SINGLE_ROUNDING */ + left_shift = out_shift<0?0:out_shift; + right_shift = out_shift>0?0:-out_shift; +#endif /* #if TFLITE_SINGLE_ROUNDING */ + + if(p_mat1 && p_vec1) + { + for(vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL-1)); vec_itr+= VEC_UNROLL) + { + for(m_itr = 0; m_itr < (rows & ~(ROW_UNROLL-1)); m_itr += ROW_UNROLL) + { + SETUP_BIAS_BATCH; + SETUP_ACC_BATCH; + SETUP_VEC_BATCH; + SETUP_MAT1; + + for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++) + { + LOAD_VEC_BATCH; + LOAD_MAT1; + KERNEL_MAT1_VEC_BATCH; + } + ADD_BIAS_ACC_BATCH; + ADJUST_ACC_BATCH; + STORE_ACC_BATCH; + } + for(; m_itr < rows; m_itr++) + { + UNROLL_ROW_SETUP_BIAS_BATCH(0); + UNROLL_ROW_SETUP_ACC_BATCH(0); + SETUP_VEC_BATCH; + UNROLL_SETUP_MAT1(0); + + for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++) + { + LOAD_VEC_BATCH; + UNROLL_LOAD_ROW_MAT1(0); + UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0); + } + UNROLL_ROW_ADD_BIAS_ACC(0); + UNROLL_ROW_ADJUST_ACC(0); + UNROLL_ROW_STORE_ACC(0); + } + } + /* Tail loop for vec unroll */ + for(; vec_itr < vec_count; vec_itr++) + { + for(m_itr = 0; m_itr < (rows & ~(ROW_UNROLL-1)); m_itr += ROW_UNROLL) + { + SETUP_BIAS_BATCH_TAIL; + SETUP_ACC_BATCH_TAIL; + UNROLL_SETUP_VEC_BATCH(0); + SETUP_MAT1; + + for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++) + { + UNROLL_LOAD_VEC_BATCH(0); + LOAD_MAT1; + KERNEL_MAT1_VEC_BATCH_TAIL; + } + ADD_BIAS_ACC_BATCH_TAIL; + ADJUST_ACC_BATCH_TAIL; + STORE_ACC_BATCH_TAIL; + } + for(; m_itr < rows; m_itr++) + { + UNROLL_SETUP_BIAS_BATCH(0,0); + UNROLL_SETUP_ACC_BATCH(0,0); + UNROLL_SETUP_VEC_BATCH(0); + UNROLL_SETUP_MAT1(0); + + for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++) + { + UNROLL_LOAD_VEC_BATCH(0); + UNROLL_LOAD_ROW_MAT1(0); + UNROLL_KERNEL_MAT1_VEC_BATCH(0,0); + } + UNROLL_ADD_BIAS_ACC_BATCH(0,0); + UNROLL_ADJUST_ACC_BATCH(0,0); + UNROLL_STORE_ACC_BATCH(0,0); + } + } + } + else + { + return -1; + } + return 0; +} \ No newline at end of file