diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 582bf178bff..c04dd1fafd3 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -125,6 +125,11 @@
     - arg_meta: null
       kernel_name: impl::HiFi::dequantize_per_tensor_out
 
+# Quantized convolution. The HiFi kernel bound below is implemented in
+# backends/cadence/hifi/operators/quantized_conv_out.cpp.
+- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::HiFi::quantized_conv_out
 
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index 90cd814e1e5..db2143d0c93 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -9,6 +9,9 @@ add_library(
   cadence_kernels
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_matXvec_asym8xasym8_asym8_circ.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
@@ -28,6 +31,7 @@ target_include_directories(
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include
     ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/
+    ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/kernels/cnn/hifi4/
 	${_common_include_directories}
 )
 
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 7601d969447..2396bc4b414 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -16,6 +16,32 @@
 #include "xa_nnlib_kernels_api.h"
 
 /* Potential NNLIB function/APIs */
+/* 2D convolution on unsigned 8-bit input and kernel data with an unsigned
+ * 8-bit output, requantized through p_out_multiplier / p_out_shift. The
+ * caller in operators/quantized_conv_out.cpp passes one multiplier and one
+ * shift per output channel, and a scratch buffer sized with
+ * xa_nn_conv2d_getsize.
+ * NOTE(review): presumably implemented by
+ * third-party/nnlib/xa_nn_conv2d_asym8xasym8.c, which is added to the
+ * cadence_kernels target in the same change - confirm. The meaning of
+ * out_data_format and of the return value is not visible from this header.
+ */
+extern "C" WORD32 xa_nn_conv2d_per_chan_asym8xasym8(UWORD8* __restrict__ p_out,
+                                const UWORD8* __restrict__ p_inp,
+                                const UWORD8* __restrict__ p_kernel,
+                                const WORD32* __restrict__ p_bias,
+                                WORD32 input_height,
+                                WORD32 input_width,
+                                WORD32 input_channels,
+                                WORD32 kernel_height,
+                                WORD32 kernel_width,
+                                WORD32 kernel_channels,
+                                WORD32 dilation_height,
+                                WORD32 dilation_width,
+                                WORD32 out_channels,
+                                WORD32 x_stride,
+                                WORD32 y_stride,
+                                WORD32 x_padding,
+                                WORD32 y_padding,
+                                WORD32 out_height,
+                                WORD32 out_width,
+                                WORD32 input_zero_bias,
+                                WORD32 * p_out_multiplier,
+                                WORD32 * p_out_shift,
+                                WORD32 out_zero_bias,
+                                WORD32 out_data_format,
+                                VOID *p_scratch);
+    
 extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
                                 const WORD32 *const p_out_shape,
                                 const FLOAT32 * __restrict__ p_inp1,
diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt
index 0bd117771f9..2bcf4321d48 100644
--- a/backends/cadence/hifi/operators/CMakeLists.txt
+++ b/backends/cadence/hifi/operators/CMakeLists.txt
@@ -66,6 +66,7 @@ target_include_directories(
 add_library(
   custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
              "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp"
+             "quantized_conv_out.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
diff --git a/backends/cadence/hifi/operators/quantized_conv_out.cpp b/backends/cadence/hifi/operators/quantized_conv_out.cpp
new file mode 100644
index 00000000000..b40becd1e40
--- /dev/null
+++ b/backends/cadence/hifi/operators/quantized_conv_out.cpp
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "kernels.h"
+
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+// Rounds the address (or integer) `x` up to the next multiple of `bytes`.
+// `bytes` must be a power of two. The result is an integer of type uintptr_t;
+// callers cast it back to the pointer type they need. Both arguments are
+// fully parenthesised so expressions such as `1 << 3` are safe, and the
+// pointer is widened to uintptr_t so it is not truncated on 64-bit hosts.
+#define ALIGN_PTR(x, bytes) \
+  ((((uintptr_t)(x)) + ((uintptr_t)(bytes) - 1)) & ~((uintptr_t)(bytes) - 1))
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+using Tensor = exec_aten::Tensor;
+using RuntimeContext = torch::executor::RuntimeContext;
+using ScalarType = exec_aten::ScalarType;
+
+
+// Generic (reference) 2d convolution kernel that operates on raw pointers.
+// It handles both quantized and fp32 convolutions, selected at compile time
+// by the `quantized` template flag.
+//   input  : [n x c x h x w]
+//   weight : [oc x wc x wh x ww], where wc is the channel count per group
+//   bias   : [oc]
+//   output : [n x oc x oh x ow]
+//
+// s0/s1, p0/p1 and d0/d1 are the stride, padding and dilation along the
+// height and width axes. `groups` splits the input and output channels into
+// independent groups.
+//
+// Quantized-only arguments:
+//   in_zero_point        subtracted from every input element.
+//   weight_zero_point    element [0] is subtracted from every weight element;
+//                        a null pointer is treated as a zero point of 0.
+//   bias_scale           scale applied to the accumulator; one element when
+//                        per_tensor_quantized is true, otherwise indexed by
+//                        output channel.
+//   out_scale,
+//   out_zero_point       passed to kernels::quantize (as 1 / out_scale and
+//                        out_zero_point) to produce the output value.
+template <typename IT, typename WT, typename BT, typename OT, bool quantized>
+__attribute__((noinline)) void conv2d_nchw_core_generic(
+    // All the arrays
+    const IT* __restrict__ p_in,
+    const WT* __restrict__ p_weight,
+    const BT* __restrict__ p_bias,
+    OT* __restrict__ p_out,
+    // The array sizes
+    int32_t n,
+    int32_t c,
+    int32_t h,
+    int32_t w,
+    int32_t oc,
+    int32_t wc,
+    int32_t wh,
+    int32_t ww,
+    int32_t oh,
+    int32_t ow,
+    // Stride
+    int16_t s0,
+    int16_t s1,
+    // Padding
+    int16_t p0,
+    int16_t p1,
+    // Dilation
+    int16_t d0,
+    int16_t d1,
+    // Group for depthwise conv
+    int16_t groups,
+    // Optional args that are only relevant for quantized convolution
+    // input zero point
+    IT in_zero_point = 0,
+    // weight zero point
+    const int32_t* __restrict__ weight_zero_point = nullptr,
+    const float* __restrict__ bias_scale = nullptr,
+    float out_scale = 1,
+    OT out_zero_point = 0,
+    bool per_tensor_quantized = true) {
+  float inv_out_scale = 1. / out_scale;
+  bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0;
+
+  // The weight zero point is loop invariant, so read it once. It is only
+  // dereferenced for the quantized instantiation and only when provided.
+  const int32_t w_zero_point =
+      (quantized && weight_zero_point != nullptr) ? weight_zero_point[0] : 0;
+
+  // Compute the number of in and out channels per group
+  const int ocpg = oc / groups;
+  const int icpg = c / groups;
+
+  // Iterate over all the output batches (i.e., n)
+  for (int _n = 0; _n < n; ++_n) {
+    const IT* in_batch = p_in + _n * c * h * w;
+    OT* out_batch = p_out + _n * oc * oh * ow;
+    // Compute separable convolution for each group
+    for (int _g = 0; _g < groups; ++_g) {
+      // Identify the input and output channels involved in the computation
+      // of this group
+      int sic = _g * icpg;
+      int soc = _g * ocpg;
+      // Populate all the output channels in the group
+      for (int _oc = soc; _oc < soc + ocpg; ++_oc) {
+        OT* out_plane = out_batch + _oc * oh * ow;
+        const WT* weight_batch = p_weight + _oc * wc * wh * ww;
+        // We compute one output channel at a time. The computation can be
+        // thought of as a stencil computation: we iterate over an input of size
+        // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an
+        // output channel of size 1 x oh x ow.
+        for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) {
+          for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) {
+            float acc = p_bias[_oc];
+            // Below is the stencil computation that performs the hadamard
+            // product+accumulation of each input channel (contributing to the
+            // output channel being computed) with the corresponding weight
+            // channel.
+            // If the padding is 0, and dilation is 1, then we can remove the
+            // unnecessary checks, and simplify the code so that it can be
+            // vectorized by Tensilica compiler.
+            if (zero_pad_unit_dilation) {
+              for (int _ic = sic; _ic < sic + icpg; ++_ic) {
+                const IT* in_plane = in_batch + _ic * h * w;
+                const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww;
+                for (int _wh = 0; _wh < wh; ++_wh) {
+                  for (int _ww = 0; _ww < ww; ++_ww) {
+                    int ioff = (_h + _wh) * w + (_w + _ww);
+                    int woff = _wh * ww + _ww;
+                    float lhs = in_plane[ioff] - in_zero_point;
+                    float rhs = weight_plane[woff] - w_zero_point;
+                    acc += lhs * rhs;
+                  }
+                }
+              }
+            } else {
+              for (int _ic = sic; _ic < sic + icpg; ++_ic) {
+                const IT* in_plane = in_batch + _ic * h * w;
+                const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww;
+                for (int _wh = 0; _wh < wh; ++_wh) {
+                  for (int _ww = 0; _ww < ww; ++_ww) {
+                    // Taps that land in the padding region contribute nothing,
+                    // so they are skipped rather than read.
+                    const int in_row = _h + d0 * _wh - p0;
+                    const int in_col = _w + d1 * _ww - p1;
+                    if (in_row >= 0 && in_row < h && in_col >= 0 &&
+                        in_col < w) {
+                      int ioff = in_row * w + in_col;
+                      int woff = _wh * ww + _ww;
+                      float lhs = in_plane[ioff] - in_zero_point;
+                      float rhs = weight_plane[woff] - w_zero_point;
+                      acc += lhs * rhs;
+                    }
+                  }
+                }
+              }
+            }
+            if (quantized) {
+              float val =
+                  (per_tensor_quantized ? bias_scale[0] : bias_scale[_oc]) *
+                  acc;
+              out_plane[_oh * ow + _ow] =
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
+            } else {
+              out_plane[_oh * ow + _ow] = acc;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Runs a quantized NCHW convolution through an NNLib per-channel conv2d
+// kernel.
+//
+//   ET        ExecuTorch element type of input / weight / out.
+//   NT        matching NNLib element type handed to `conv_fn`.
+//   conv_fn   NNLib kernel, invoked once per batch.
+//   precision precision code passed to xa_nn_conv2d_getsize for both the
+//             input and the kernel.
+//
+// The input ([n, c, h, w]) and the weight ([oc, wc, wh, ww]) are first
+// transposed with permutation {0, 2, 3, 1} into temporary 8-byte aligned
+// buffers, i.e. the NNLib kernel sees channel-last data. The kernel writes
+// straight into `out` with out_data_format = 1.
+// NOTE(review): presumably out_data_format = 1 selects channel-first output -
+// confirm against the NNLib documentation.
+//
+// Limitations carried over from the original implementation:
+//   * `groups` and `dilation` are not consulted (unit dilation is assumed);
+//     the caller must only use this path when groups == 1.
+//   * the weight zero point is not applied.
+//   * requantization uses multiplier = bias_scale / output_scale * 2^31 with
+//     a shift of 0. NOTE(review): this presumably requires the effective
+//     scale to be below 1.0, otherwise the float to WORD32 conversion
+//     overflows - confirm.
+//
+// Allocation failures and non-zero NNLib status codes abort through
+// ET_CHECK_MSG instead of silently producing garbage.
+template <typename ET, typename NT, typename ConvFn>
+static void conv2d_nchw_nnlib(
+    ConvFn conv_fn,
+    WORD32 precision,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    exec_aten::IntArrayRef stride,
+    exec_aten::IntArrayRef padding,
+    int64_t in_zero_point,
+    const Tensor& bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+  const bool conv1d = input.dim() == 3;
+
+  NT* p_out = (NT*)out.mutable_data_ptr<ET>();
+  WORD8* p_inp = (WORD8*)input.const_data_ptr<ET>();
+  WORD8* p_kernel = (WORD8*)weight.const_data_ptr<ET>();
+  WORD32* p_bias = (WORD32*)bias.const_data_ptr<int32_t>();
+
+  // A 1d convolution is treated as a 2d convolution of height 1.
+  WORD32 batches = input.size(0);
+  WORD32 input_channels = input.size(1);
+  WORD32 input_height = conv1d ? 1 : input.size(2);
+  WORD32 input_width = conv1d ? input.size(2) : input.size(3);
+  WORD32 out_channels = weight.size(0);
+  WORD32 kernel_channels = weight.size(1);
+  WORD32 kernel_height = conv1d ? 1 : weight.size(2);
+  WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3);
+  WORD32 out_height = conv1d ? 1 : out.size(2);
+  WORD32 out_width = conv1d ? out.size(2) : out.size(3);
+
+  WORD32 x_stride = stride[1];
+  WORD32 y_stride = stride[0];
+  WORD32 x_padding = padding[1];
+  WORD32 y_padding = padding[0];
+  WORD32 dilation_width = 1;
+  WORD32 dilation_height = 1;
+
+  WORD32 input_zero_bias = -in_zero_point;
+  WORD32 out_zero_bias = output_zero_point;
+  WORD32 out_data_format = 1;
+
+  // Per-output-channel requantization parameters. bias_scale holds either a
+  // single per-tensor scale or one scale per output channel.
+  WORD32 out_multiplier32[out_channels];
+  WORD32 out_shift32[out_channels];
+  const float* p_bias_scale = bias_scale.const_data_ptr<float>();
+  const bool per_tensor_quantized = bias_scale.numel() == 1;
+  const float inv_out_scale = 1. / output_scale;
+  for (int i = 0; i < out_channels; i++) {
+    const float scale = per_tensor_quantized ? p_bias_scale[0] : p_bias_scale[i];
+    out_multiplier32[i] = scale * inv_out_scale * 2147483648;
+    out_shift32[i] = 0;
+  }
+
+  // Temporary buffers for the transposed input and weight. 8 extra bytes
+  // leave room for rounding the start address up to an 8-byte boundary.
+  WORD8* in_raw = (WORD8*)malloc(
+      ((batches * input_channels * input_height * input_width) + 8) *
+      sizeof(WORD8));
+  WORD8* kernel_raw = (WORD8*)malloc(
+      ((out_channels * kernel_channels * kernel_height * kernel_width) + 8) *
+      sizeof(WORD8));
+  ET_CHECK_MSG(
+      in_raw != nullptr && kernel_raw != nullptr,
+      "quantized_conv_out: failed to allocate transpose buffers");
+
+  WORD8* pin = (WORD8*)ALIGN_PTR(in_raw, 8);
+  WORD8* pkernel = (WORD8*)ALIGN_PTR(kernel_raw, 8);
+
+  WORD32 permute_vec[4] = {0, 2, 3, 1};
+
+  // input: [n, c, h, w] -> [n, h, w, c]
+  WORD32 in_src_shape[4] = {
+      batches, input_channels, input_height, input_width};
+  WORD32 in_dst_shape[4] = {
+      batches, input_height, input_width, input_channels};
+  WORD32 status = xa_nn_transpose_8_8(
+      pin, in_dst_shape, p_inp, in_src_shape, permute_vec, 4, 4);
+  ET_CHECK_MSG(
+      status == 0,
+      "quantized_conv_out: input transpose failed with status %d",
+      (int)status);
+
+  // weight: [oc, wc, wh, ww] -> [oc, wh, ww, wc]
+  WORD32 kernel_src_shape[4] = {
+      out_channels, kernel_channels, kernel_height, kernel_width};
+  WORD32 kernel_dst_shape[4] = {
+      out_channels, kernel_height, kernel_width, kernel_channels};
+  status = xa_nn_transpose_8_8(
+      pkernel, kernel_dst_shape, p_kernel, kernel_src_shape, permute_vec, 4, 4);
+  ET_CHECK_MSG(
+      status == 0,
+      "quantized_conv_out: weight transpose failed with status %d",
+      (int)status);
+
+  WORD32 scratch_size = xa_nn_conv2d_getsize(
+      input_height,
+      input_width,
+      input_channels,
+      kernel_height,
+      kernel_width,
+      kernel_channels,
+      dilation_height,
+      dilation_width,
+      y_stride,
+      y_padding,
+      x_stride,
+      x_padding,
+      out_height,
+      out_width,
+      out_channels,
+      precision,
+      precision,
+      out_data_format);
+  scratch_size = scratch_size < 0 ? 0 : scratch_size;
+
+  // The scratch pointer is rounded up to an 8-byte boundary below, so the
+  // allocation must include slack; otherwise the kernel could run past the
+  // end of the buffer.
+  WORD32* scratch_raw = (WORD32*)malloc(scratch_size + 16);
+  ET_CHECK_MSG(
+      scratch_raw != nullptr,
+      "quantized_conv_out: failed to allocate scratch buffer");
+  pVOID p_scratch = (pVOID)ALIGN_PTR(scratch_raw, 8);
+
+  for (int _n = 0; _n < batches; ++_n) {
+    NT* in_batch =
+        (NT*)pin + _n * input_channels * input_height * input_width;
+    NT* out_batch = p_out + _n * out_channels * out_height * out_width;
+
+    status = conv_fn(
+        out_batch,
+        in_batch,
+        (NT*)pkernel,
+        p_bias,
+        input_height,
+        input_width,
+        input_channels,
+        kernel_height,
+        kernel_width,
+        kernel_channels,
+        dilation_height,
+        dilation_width,
+        out_channels,
+        x_stride,
+        y_stride,
+        x_padding,
+        y_padding,
+        out_height,
+        out_width,
+        input_zero_bias,
+        out_multiplier32,
+        out_shift32,
+        out_zero_bias,
+        out_data_format,
+        p_scratch);
+    ET_CHECK_MSG(
+        status == 0,
+        "quantized_conv_out: NNLib conv2d failed with status %d",
+        (int)status);
+  }
+
+  free(in_raw);
+  free(kernel_raw);
+  free(scratch_raw);
+}
+
+// The quantized convolution kernel. in_scale and weight_scale are implicit in
+// bias_scale, since it is a product of the two. A 3-dimensional input is
+// handled as a 1d convolution, i.e. a 2d convolution of height 1.
+//
+// int8 (Char) and uint8 (Byte) inputs with groups == 1 are dispatched to the
+// NNLib kernels. int8 / uint8 inputs with groups != 1 fall back to
+// conv2d_nchw_core_generic, because the NNLib path does not take `groups`
+// into account. Any other input dtype aborts through ET_CHECK_MSG.
+//
+// NOTE(review): `dilation` is not consulted on any path (unit dilation is
+// passed everywhere), and `ctx`, `out_multiplier`, `out_shift` and
+// `channel_last` are unused - confirm this is intended.
+void quantized_conv_out(
+    RuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    exec_aten::IntArrayRef stride,
+    exec_aten::IntArrayRef padding,
+    exec_aten::IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    bool channel_last,
+    Tensor& out) {
+  const ScalarType dtype = input.scalar_type();
+  const bool nnlib_supported = groups == 1;
+
+  if (nnlib_supported && dtype == ScalarType::Char) {
+    conv2d_nchw_nnlib<int8_t, WORD8>(
+        xa_nn_conv2d_per_chan_sym8sxasym8s,
+        8,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        in_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out);
+    return;
+  }
+
+  if (nnlib_supported && dtype == ScalarType::Byte) {
+    conv2d_nchw_nnlib<uint8_t, UWORD8>(
+        xa_nn_conv2d_per_chan_asym8xasym8,
+        -3,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        in_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out);
+    return;
+  }
+
+  // Generic fallback.
+  const bool conv1d = input.dim() == 3;
+  // input = [n, c, h, w]
+  const int n = input.size(0);
+  const int c = input.size(1);
+  const int h = conv1d ? 1 : input.size(2);
+  const int w = conv1d ? input.size(2) : input.size(3);
+  // weight = [oc, wc, wh, ww]
+  const int oc = weight.size(0);
+  const int wc = weight.size(1);
+  const int wh = conv1d ? 1 : weight.size(2);
+  const int ww = conv1d ? weight.size(2) : weight.size(3);
+  // output = [n, oc, oh, ow]
+  const int oh = conv1d ? 1 : out.size(2);
+  const int ow = conv1d ? out.size(2) : out.size(3);
+
+  // Bool flag to check if weight tensor is quantized per-tensor or
+  // per-channel
+  bool per_tensor_quantized = bias_scale.numel() == 1;
+
+  if (dtype == ScalarType::Char) {
+    conv2d_nchw_core_generic<int8_t, int8_t, int32_t, int8_t, true>(
+        input.const_data_ptr<int8_t>(),
+        weight.const_data_ptr<int8_t>(),
+        bias.const_data_ptr<int32_t>(),
+        out.mutable_data_ptr<int8_t>(),
+        n,
+        c,
+        h,
+        w,
+        oc,
+        wc,
+        wh,
+        ww,
+        oh,
+        ow,
+        stride[0],
+        stride[1],
+        padding[0],
+        padding[1],
+        1, // unit dilation; `dilation` is not consulted
+        1, // unit dilation; `dilation` is not consulted
+        groups,
+        in_zero_point,
+        weight_zero_point.const_data_ptr<int32_t>(),
+        bias_scale.const_data_ptr<float>(),
+        output_scale,
+        (int8_t)output_zero_point,
+        per_tensor_quantized);
+  } else if (dtype == ScalarType::Byte) {
+    conv2d_nchw_core_generic<uint8_t, uint8_t, int32_t, uint8_t, true>(
+        input.const_data_ptr<uint8_t>(),
+        weight.const_data_ptr<uint8_t>(),
+        bias.const_data_ptr<int32_t>(),
+        out.mutable_data_ptr<uint8_t>(),
+        n,
+        c,
+        h,
+        w,
+        oc,
+        wc,
+        wh,
+        ww,
+        oh,
+        ow,
+        stride[0],
+        stride[1],
+        padding[0],
+        padding[1],
+        1, // unit dilation; `dilation` is not consulted
+        1, // unit dilation; `dilation` is not consulted
+        groups,
+        in_zero_point,
+        weight_zero_point.const_data_ptr<int32_t>(),
+        bias_scale.const_data_ptr<float>(),
+        output_scale,
+        (uint8_t)output_zero_point,
+        per_tensor_quantized);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(dtype));
+  }
+}
+
+}; // namespace native
+}; // namespace HiFi
+}; // namespace impl
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c
new file mode 100644
index 00000000000..111a90d8e10
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_asym8xasym8.c
@@ -0,0 +1,469 @@
+#include "xa_nnlib_common.h"
+#include "xa_nnlib_common_macros.h"
+#include "xa_nn_conv2d_std_state.h"
+
+static WORD32 conv_x_left_pad( /* Fills the leading output columns whose kernel window covers only left x-padding; returns how many columns were filled. */
+    WORD32 x_padding, /* left padding of the input, in columns */
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    WORD32 out_width, /* upper bound on the number of columns filled */
+    WORD32 out_height,
+    WORD32 out_channels,
+    WORD32 out_channels_offset, /* p_out index step between channels */
+    WORD32 out_width_offset, /* p_out index step between columns */
+    WORD32 out_height_offset, /* p_out index step between rows */
+    const WORD32* __restrict__ p_bias, /* one bias per output channel; NULL is treated as all-zero */
+    WORD8 *p_out,
+    WORD32 * p_out_multiplier, /* read per output channel */
+    WORD32 * p_out_shift, /* read per output channel */
+    WORD32 out_zero_bias) /* added to every requantized value before clamping */
+{
+  WORD32 i,j,k;
+  WORD32 out_width_over_x_pad = (x_padding - kernel_width)/x_stride + 1; /* columns whose window never reaches real input */
+  WORD32 left_shift, right_shift;
+  out_width_over_x_pad = out_width_over_x_pad > out_width ? out_width : out_width_over_x_pad; /* never exceed the output width */
+
+  ae_int32x2 max_int8 = AE_MOVDA32(255); /* despite the names, the clamp range is 0..255 */
+  ae_int32x2 min_int8 = AE_MOVDA32(0);
+
+  /* When the kernel convolves over the x-left pad region only, the output is just the requantized bias */
+  for(i = 0; i < out_height; i++)
+  {
+    for(j = 0; j < out_width_over_x_pad; j++)
+    {
+      for(k = 0; k < out_channels; k++)
+      {
+#if TFLITE_SINGLE_ROUNDING
+        left_shift  = p_out_shift[k];
+        /* Single rounding macro doesn't need two shifts so this is not used */
+        (void)right_shift;
+#else /* #if TFLITE_SINGLE_ROUNDING */
+        left_shift  = p_out_shift[k] < 0 ? 0 : p_out_shift[k]; /* positive shift: applied as a left shift */
+        right_shift = p_out_shift[k] > 0 ? 0 : -p_out_shift[k]; /* negative shift: applied as a right shift */
+#endif /* #if TFLITE_SINGLE_ROUNDING */          
+        ae_int32x2 acc;
+#if XCHAL_HAVE_HIFI1
+        if(p_bias != NULL){
+          acc = AE_L32_I((ae_int32*)&p_bias[k], 0);
+        }
+        else{
+          acc = AE_MOVDA32(0);
+        }
+        MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift);
+        acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias));
+        acc = AE_MAX32(acc, min_int8);
+        acc = AE_MIN32(acc, max_int8);
+        AE_S8_0_X_HIFI1( AE_MOVINT16X4_FROMINT32X2(acc), (WORD8 *)p_out, (i * out_height_offset + j * out_width_offset + k * out_channels_offset));
+#else
+        if(p_bias != NULL){
+          acc = AE_MOVDA32(p_bias[k]);
+        }
+        else{
+          acc = AE_MOVDA32(0);
+        }
+        MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift);
+        acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias));
+#if 0
+        AE_MINMAX32(acc, min_int8, max_int8);
+#else
+        acc = AE_MAX32(acc, min_int8); /* clamp from below, then from above */
+        acc = AE_MIN32(acc, max_int8);
+#endif
+        p_out[i * out_height_offset + j * out_width_offset + k * out_channels_offset] = (UWORD8)AE_MOVAD32_L(acc); /* store the clamped value as one byte */
+#endif
+      }
+    }
+  }
+  return out_width_over_x_pad; /* the caller skips this many columns and reduces its remaining padding */
+}
+
+static WORD32 conv_x_right_pad( /* Fills the trailing output columns whose kernel window covers only right x-padding; returns how many columns were filled. */
+    WORD32 x_padding, /* left padding of the input, in columns */
+    WORD32 input_width,
+    WORD32 x_stride,
+    WORD32 out_width, /* columns from the computed start index up to out_width are filled */
+    WORD32 out_height,
+    WORD32 out_channels,
+    WORD32 out_channels_offset, /* p_out index step between channels */
+    WORD32 out_width_offset, /* p_out index step between columns */
+    WORD32 out_height_offset, /* p_out index step between rows */
+    const WORD32* __restrict__ p_bias, /* one bias per output channel; NULL is treated as all-zero */
+    WORD8 *p_out,
+    WORD32 * p_out_multiplier, /* read per output channel */
+    WORD32 * p_out_shift, /* read per output channel */
+    WORD32 out_zero_bias) /* added to every requantized value before clamping */
+{
+  WORD32 i,j,k;
+  WORD32 idx_out_width_over_x_r_pad = (x_padding + input_width + x_stride - 1)/x_stride + 1; /* first output column filled here */
+  WORD32 left_shift, right_shift;
+  WORD32 out_width_over_x_r_pad = out_width - idx_out_width_over_x_r_pad; /* number of columns filled; this is the return value */
+
+  ae_int32x2 max_int8 = AE_MOVDA32(255); /* despite the names, the clamp range is 0..255 */
+  ae_int32x2 min_int8 = AE_MOVDA32(0);
+
+  /* When the kernel convolves over the x-right pad region only, the output is just the requantized bias */
+  for(i = 0; i < out_height; i++)
+  {
+    for(j = idx_out_width_over_x_r_pad; j < out_width; j++)
+    {
+      for(k = 0; k < out_channels; k++)
+      {
+#if TFLITE_SINGLE_ROUNDING
+        left_shift  = p_out_shift[k];
+        /* Single rounding macro doesn't need two shifts so this is not used */
+        (void)right_shift;
+#else /* #if TFLITE_SINGLE_ROUNDING */
+        left_shift  = p_out_shift[k] < 0 ? 0 : p_out_shift[k]; /* positive shift: applied as a left shift */
+        right_shift = p_out_shift[k] > 0 ? 0 : -p_out_shift[k]; /* negative shift: applied as a right shift */
+#endif /* #if TFLITE_SINGLE_ROUNDING */          
+        ae_int32x2 acc;
+#if XCHAL_HAVE_HIFI1
+        if(p_bias != NULL){
+           acc = AE_L32_I((ae_int32*)&p_bias[k], 0);
+        }
+        else{
+          acc = AE_MOVDA32(0);
+        }
+        MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift);
+        acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias));
+        acc = AE_MAX32(acc, min_int8);
+        acc = AE_MIN32(acc, max_int8);
+        AE_S8_0_X_HIFI1( AE_MOVINT16X4_FROMINT32X2(acc), (WORD8 *)p_out, (i * out_height_offset + j * out_width_offset + k * out_channels_offset));
+#else
+        if(p_bias != NULL){
+          acc = AE_MOVDA32(p_bias[k]);
+        }
+        else{
+          acc = AE_MOVDA32(0);
+        }
+        MPY_BY_QUANT_MULT_X2_OUT32(acc, acc, p_out_multiplier[k], left_shift, right_shift);
+        acc = AE_ADD32S(acc, AE_MOVDA32(out_zero_bias));
+#if 0
+        AE_MINMAX32(acc, min_int8, max_int8);
+#else
+        acc = AE_MAX32(acc, min_int8); /* clamp from below, then from above */
+        acc = AE_MIN32(acc, max_int8);
+#endif
+        p_out[i * out_height_offset + j * out_width_offset + k * out_channels_offset] = (UWORD8)AE_MOVAD32_L(acc); /* store the clamped value as one byte */
+#endif
+      }
+    }
+  }
+  return out_width_over_x_r_pad; /* the caller excludes this many columns from its main loop */
+}
+
+#ifdef polyphase_debug
+#include<stdio.h>
+void writingoutput(WORD8* __restrict__ p_out_base, WORD32 out_height, WORD32 out_width,WORD32 out_channels )
+{
+	/* Debug helper: writes every out_channels-th byte of p_out_base, one decimal value per line, for out_height*out_width entries. */
+	int i,j, count = 0;
+	FILE * dataFilePr = fopen("C:/Users/hariev/Documents/file.txt", "w+");
+	if(dataFilePr == NULL) return; /* the file could not be opened: skip the dump instead of passing NULL to fprintf/fclose */
+	for(i=0;i<out_height;i++)
+		for(j=0;j<out_width;j++)
+		{
+			fprintf(dataFilePr,"%d\n", *(p_out_base+count) );
+			count = count + out_channels; /* step to the same channel of the next entry */
+		}
+	fclose(dataFilePr);
+}
+void manipulateinput(void* p_inp, WORD32 input_height, WORD32 input_width, WORD32 input_channels, void* p_ker, WORD32 kernel_height, WORD32 kernel_width, WORD32 output_channels, void* p_bias, WORD32* p_out_multiplier, WORD32* p_out_shift, WORD32* out_zero_bias, WORD32* input_zero_bias)
+{
+	/* Debug helper: overwrites the caller's input, kernel, bias and
+	 * quantization parameters with a fixed, easily recognisable pattern. */
+	WORD8* inp_wr = (WORD8*)p_inp;
+	WORD8* ker_wr = (WORD8*)p_ker;
+	WORD32* bias_wr = (WORD32*)p_bias;
+	WORD32 row, col, ch, oc;
+	WORD32 pix_val = 0; /* shared by all channels of a pixel; cycles 0..7 */
+
+	/* Input: one value per pixel, replicated across its channels. */
+	for(row=0; row<input_height; row++)
+	{
+		for(col=0; col<input_width; col++)
+		{
+			for(ch=0; ch<input_channels; ch++)
+			{
+				*inp_wr++ = pix_val;
+			}
+			pix_val = (pix_val == 7) ? 0 : pix_val + 1;
+		}
+	}
+
+	/* Kernel: every coefficient of every output channel is set to 1. */
+	for(oc=0; oc<output_channels; oc++)
+	{
+		for(row=0; row<kernel_height; row++)
+		{
+			for(col=0; col<kernel_width; col++)
+			{
+				for(ch=0; ch<input_channels; ch++)
+				{
+					*ker_wr++ = 1;
+				}
+			}
+		}
+	}
+
+	/* Per-output-channel bias and requantization parameters. */
+	for(oc=0; oc<output_channels; oc++)
+	{
+		bias_wr[oc] = 0;
+		p_out_multiplier[oc] = 1073741823;
+		p_out_shift[oc] = -2;
+	}
+
+	/* Zero points for both the output and the input. */
+	*out_zero_bias = 0;
+	*input_zero_bias = 0;
+
+}
+#endif
+
+static void xa_nn_rearrange_chw_to_hwc
+              (pWORD8 __restrict__ p_out
+              ,const WORD8*  __restrict__ p_inp
+              ,WORD32 height
+              ,WORD32 width
+              ,WORD32 channels
+              )
+{
+  /* Repack a channel-major (C,H,W) byte tensor into channel-minor (H,W,C)
+   * order. Output bytes are produced sequentially; for each (row, col) the
+   * source is sampled once per channel, one H*W plane apart. */
+  const int plane_size = width * height;
+  int row, col, ch;
+
+  for(row = 0; row < height; row++)
+  {
+    for(col = 0; col < width; col++)
+    {
+      /* First channel's sample for this pixel. */
+      const WORD8 *p_src = p_inp + (row * width) + col;
+
+      for(ch = 0; ch < channels; ch++, p_src += plane_size)
+      {
+        *p_out++ = *p_src;
+      }
+    }
+  }
+
+}
+
+WORD32 xa_nn_conv2d_per_chan_asym8xasym8( /* Grouped 2-D convolution on unsigned 8-bit input/kernel/output using a circular buffer in p_scratch. Returns 0 on success; the XA_NNLIB_ARG_CHK_* macros are passed -1 for rejected arguments. */
+    UWORD8* __restrict__ p_out, /* output; element strides are chosen by out_data_format */
+    const UWORD8* __restrict__ p_inp,
+    const UWORD8* __restrict__ p_kernel,
+    const WORD32* __restrict__ p_bias, /* may be NULL (no bias) */
+    WORD32 input_height,
+    WORD32 input_width,
+    WORD32 input_channels,
+    WORD32 kernel_height,
+    WORD32 kernel_width,
+    WORD32 kernel_channels, /* channels per group; must divide input_channels */
+    WORD32 dilation_height, /* only 1 is accepted */
+    WORD32 dilation_width, /* only 1 is accepted */
+    WORD32 out_channels, /* must be a multiple of the group count */
+    WORD32 x_stride,
+    WORD32 y_stride,
+    WORD32 x_padding,
+    WORD32 y_padding,
+    WORD32 out_height,
+    WORD32 out_width,
+    WORD32 input_zero_bias, /* must be in [-255, 0]; its negation is the padding value */
+    WORD32 * p_out_multiplier, /* entry [k] is used for padding-only columns, entry [0] for the rest */
+    WORD32 * p_out_shift, /* each of the first out_channels entries must be in [-31, 31] */
+    WORD32 out_zero_bias, /* must be in [0, 255] */
+    WORD32 out_data_format, /* 0: channel stride 1; 1: channel stride out_h*out_w */
+    VOID *p_scratch) /* working memory for the state, circular buffer and padded kernel */
+{
+   /* NULL pointer checks */
+  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_kernel, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_scratch, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_out_multiplier, -1); /* dereferenced below, so reject NULL like the other pointers */
+  XA_NNLIB_ARG_CHK_PTR(p_out_shift, -1);
+  
+  /* Pointer alignment checks */
+  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_kernel, sizeof(UWORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
+  
+  //XA_NNLIB_ARG_CHK_ALIGN(p_scratch, sizeof(WORD8), -1);
+  /* Basic Parameter checks */
+  XA_NNLIB_ARG_CHK_COND((input_height <= 0 || input_width <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((input_channels <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((kernel_channels <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((kernel_height <= 0 || kernel_width <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((out_channels <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((y_stride <= 0 || x_stride <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((y_padding < 0 || x_padding < 0), -1);
+  XA_NNLIB_ARG_CHK_COND((out_height <= 0 || out_width <= 0), -1);
+  XA_NNLIB_ARG_CHK_COND((input_zero_bias < -255 || input_zero_bias > 0), -1);
+  XA_NNLIB_ARG_CHK_COND((out_zero_bias < 0 || out_zero_bias > 255), -1);
+  XA_NNLIB_ARG_CHK_COND((out_data_format != 0 && out_data_format != 1), -1);
+  XA_NNLIB_ARG_CHK_COND((dilation_height != 1), -1);
+  XA_NNLIB_ARG_CHK_COND((dilation_width != 1), -1);
+
+  int itr;
+  for(itr=0;itr<out_channels;itr++){
+    XA_NNLIB_ARG_CHK_COND((p_out_shift[itr] < -31 || p_out_shift[itr] > 31), -1);
+  }
+
+  const int groups = input_channels/kernel_channels; /* kernel_channels > 0 was checked above */
+  XA_NNLIB_ARG_CHK_COND((groups<=0), -1);
+  XA_NNLIB_ARG_CHK_COND(((input_channels %kernel_channels)!=0),-1);
+  XA_NNLIB_ARG_CHK_COND(((out_channels%groups)!=0),-1);
+  const int kernels_per_group = out_channels / groups;
+  XA_NNLIB_ARG_CHK_COND((kernels_per_group<=0),-1);
+  
+  WORD32 j;
+  WORD32 input_bytewidth = 1;
+  VOID *pp_inp = (VOID *)p_inp;
+  UWORD8* __restrict__ tmp_out;
+
+  p_scratch = ALIGNED_ADDR(p_scratch, ALIGNMENT);
+  xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_scratch;
+  WORD32 inp_h, inp_w, ker_h, ker_w, x_str, y_str, x_pad, y_pad, out_h, out_w;
+    
+  /* A problem that is one row high everywhere is processed with height and width swapped */
+  if ((input_height == 1) && (kernel_height == 1) && (out_height == 1))
+  {
+    inp_h = input_width;
+    inp_w = input_height;
+    ker_h = kernel_width;
+    ker_w = kernel_height;
+    x_str = y_stride;
+    y_str = x_stride;
+    x_pad = y_padding;
+    y_pad = x_padding;
+    out_h = out_width;
+    out_w = out_height;
+  }
+  else
+  {
+    inp_h = input_height;
+    inp_w = input_width;
+    ker_h = kernel_height;
+    ker_w = kernel_width;
+    x_str = x_stride;
+    y_str = y_stride;
+    x_pad = x_padding;
+    y_pad = y_padding;
+    out_h = out_height;
+    out_w = out_width;
+  }
+
+  WORD32 out_channels_offset = out_data_format ? out_h * out_w : 1;
+  WORD32 out_height_offset = out_data_format ? out_w : out_w * out_channels;
+  WORD32 out_width_offset = out_data_format ? 1 : out_channels;
+
+  WORD32 x_padding_var = x_pad;
+  WORD32 kernel_channels_pad;
+
+  kernel_channels_pad = PADDED_SIZE(kernel_channels, (ALIGNMENT >> 1));
+
+  /* When kernel convolves over x-left pad region only */
+  WORD32 out_width_over_x_pad = 0;
+  
+  if(x_padding_var >= ker_w)
+  {
+    out_width_over_x_pad = conv_x_left_pad(x_pad, ker_w, x_str, out_w, out_h, out_channels, out_channels_offset, out_width_offset, out_height_offset, p_bias, (WORD8 *)p_out, p_out_multiplier, p_out_shift, out_zero_bias);
+    x_padding_var -= out_width_over_x_pad * x_str;
+  }
+
+  /* When kernel convolves over x-right pad region only */
+  WORD32 out_width_over_x_r_pad = 0;
+  // Determine x-right padding
+  WORD32 x_r_pad = ker_w + (out_w - 1) * x_str - (x_pad + inp_w);
+  x_r_pad = x_r_pad < 0 ? 0 : x_r_pad;
+  if(x_r_pad >= ker_w)
+  {
+    out_width_over_x_r_pad = conv_x_right_pad(x_pad, inp_w, x_str, out_w, out_h, out_channels, out_channels_offset, out_width_offset, out_height_offset, p_bias, (WORD8 *)p_out, p_out_multiplier, p_out_shift, out_zero_bias);
+  }
+
+  /* When kernel convolves over input region */
+  p_out += out_width_over_x_pad * out_width_offset;
+  // Initialize circular buffer
+  // Determine y-bottom padding
+  WORD32 y_b_pad = ker_h + (out_h - 1) * y_str - (y_pad + inp_h);
+  y_b_pad = y_b_pad < 0 ? 0 : y_b_pad;
+  
+  xa_nn_conv2d_std_init_state((void*)p_state
+      ,(void*)p_kernel
+      ,inp_h
+      ,kernel_channels /* each filter holds kernel_channels values per tap, matching kernel_channels_pad used below */
+      ,ker_h
+      ,ker_w /* the swapped width, consistent with every other call in this function */
+      ,y_str
+      ,y_pad
+      ,out_h
+      ,out_channels
+      ,PREC_ASYM8U
+      ,PREC_ASYM8U);
+
+  for (int grp_i = 0; grp_i < groups; ++grp_i)
+  {
+    tmp_out=p_out+grp_i*kernels_per_group*out_channels_offset; /* first output channel of this group */
+    xa_nn_conv2d_group_init_state((void*)p_state
+        ,(void*)p_kernel
+        ,inp_h
+        ,kernel_channels
+        ,ker_h
+        ,ker_w
+        ,y_str
+        ,y_pad
+        ,out_h
+        ,out_channels
+        ,PREC_ASYM8U
+        ,PREC_ASYM8U);
+
+    pp_inp = (VOID *)(p_inp+grp_i*kernel_channels); /* first input channel of this group */
+    
+    conv2d_group_init_cir_buf(input_channels, kernel_channels_pad,kernel_channels,input_bytewidth, inp_w, inp_h, y_pad, y_b_pad, x_padding_var, ker_w, x_str, (VOID**)&pp_inp, p_state, -input_zero_bias);
+    
+      // Index to padded input width
+    WORD32 idx_beg_inp_width_pad = ker_w - x_str;
+    idx_beg_inp_width_pad = idx_beg_inp_width_pad < 0 ? 0 : idx_beg_inp_width_pad;
+  
+    // Process Loop to compute one output plane [out_h x out_channels] per iteration
+    for(j=0; j < out_w-out_width_over_x_pad-out_width_over_x_r_pad; j++)
+    {
+      // Add x_str x (inp_h x input_channels) new planes to circular buffer
+      conv2d_group_update_cir_buf(input_channels, kernel_channels_pad,kernel_channels,input_bytewidth, inp_w, inp_h, y_pad, y_b_pad, x_padding_var, ker_w, x_str, (VOID**)&pp_inp, idx_beg_inp_width_pad, p_state, -input_zero_bias);
+      
+      // Update index to input width padded
+      idx_beg_inp_width_pad += x_str;
+      
+      const WORD32 *p_bias_grp = NULL;
+      if(p_bias != NULL){
+        p_bias_grp = p_bias+grp_i*kernels_per_group;
+      }
+      
+      xa_nn_matXvec_asym8xasym8_asym8_circ
+        (tmp_out /* output */
+        ,p_state->cir_buf.p_curr/* matrix: rows x cols */
+        ,(p_state->p_kernel_padded+grp_i*kernels_per_group*kernel_channels_pad*ker_w*ker_h) /* vec: cols */
+        ,p_bias_grp/* bias */
+        ,out_h /* rows */
+        ,kernel_channels_pad * ker_w * ker_h /* cols */
+        ,kernel_channels_pad * ker_w * y_str/* row_offset */
+        ,kernels_per_group /* vec_count */
+        ,kernel_channels_pad * ker_w * ker_h /* vec_stride */
+        ,out_channels_offset /* out_col_offset */
+        ,out_height_offset /* out_row_offset */
+        ,input_zero_bias
+        ,0
+        ,p_out_multiplier[0] /* NOTE(review): only entry 0 is used here, while the padding helpers index per channel - confirm intended */
+        ,p_out_shift[0]
+        ,out_zero_bias
+        );
+
+      tmp_out += out_width_offset; /* next output column */
+    }
+  }
+  
+  return 0;
+}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c
new file mode 100644
index 00000000000..f6820f21cc3
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_conv2d_std_circ_buf.c
@@ -0,0 +1,1580 @@
+#include "xa_nnlib_common.h"
+#include "xa_nnlib_common_macros.h"
+#include "xa_nn_conv2d_std_state.h"
+
+#ifndef ENABLE_SCRATCH_SIZE_API_ONLY
+VOID xa_nn_conv2d_std_init_state( /* Lays out the conv state in p_scratch: state header, circular buffer bounds and, when channel padding is needed, a zero-padded copy of the kernel. */
+    VOID *p_scratch, /* start of the scratch area; the state struct is placed at its beginning */
+    VOID *p_kernel, /* kernel as supplied by the caller; copied only when padding is required */
+    WORD32 input_height,
+    WORD32 input_channels,
+    WORD32 kernel_height,
+    WORD32 kernel_width,
+    WORD32 y_stride,
+    WORD32 y_padding,
+    WORD32 out_height,
+    WORD32 output_channels,
+    WORD32 input_precision, /* selects element size and channel alignment; NOTE(review): numeric case labels presumably mirror the PREC_* constants - confirm */
+    WORD32 kernel_precision) /* selects the kernel element size for the padded copy */
+{
+  WORD8 *p_mem = (WORD8 *)p_scratch;
+  xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_mem;
+  size_t input_size;
+  UWORD32 align_size;
+
+  switch(input_precision)
+  {
+    case 8:
+      input_size = sizeof(WORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -8:
+    case 16:
+      input_size = sizeof(WORD16);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -1:
+      input_size = sizeof(WORD32);
+      align_size = ALIGNMENT>>2;
+      break;
+    case -3:
+      input_size = sizeof(UWORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -4:
+    case -5:
+      input_size = sizeof(WORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    default: /* unknown precision: both sizes become 0, which makes the buffer size below 0 */
+      input_size = 0;
+      align_size = 0;
+      break;
+  }
+  p_mem += sizeof(xa_nn_conv_state_t); /* skip the state header */
+  p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT);
+
+  if(((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_mem & BUS_WIDTH_MASK))
+  {
+    p_mem += BUS_WIDTH; /* Add an offset to avoid banking stall */
+  }
+
+  p_state->cir_buf.p_begin = p_mem;
+  p_state->cir_buf.p_curr = p_mem;
+
+  // Computing circular buffer size
+  // Determine y-bottom padding
+  WORD32 y_b_pad = kernel_height + (out_height - 1) * y_stride - (y_padding + input_height);
+  y_b_pad = y_b_pad < 0 ? 0 : y_b_pad;
+
+  WORD32 input_channels_pad;
+
+#if !ENABLE_PADDING_CONV2D_STD
+  if(input_precision == PREC_ASYM8S)
+  {
+    input_channels_pad = input_channels; /* no channel padding for this precision in this build configuration */
+  }
+  else
+#endif
+  {
+#if HW_AE_ADDCIRC16X4_XC 
+    /* Disable padding for ic=1 (worst case scenario for performance), if hardware support exists.
+     * Enabled only for conv2d_std_sym8sxasym8s variant */
+    if(input_channels == 1 && kernel_precision == PREC_SYM8S && input_precision == PREC_ASYM8S)
+    {
+      input_channels_pad = 1;
+    }
+    else
+#endif
+    {
+      input_channels_pad = PADDED_SIZE(input_channels, align_size);
+    }
+  }
+
+  WORD32 cir_buf_size_bytes = (y_padding + input_height + y_b_pad) * kernel_width * input_channels_pad * input_size; /* padded rows x kernel_width planes x padded channels */
+  while(cir_buf_size_bytes%16 !=0) /* grow by whole rows until the size is a multiple of 16 bytes */
+  {
+    cir_buf_size_bytes+= kernel_width*input_channels_pad*input_size;
+  }
+
+  p_mem += cir_buf_size_bytes;
+  p_state->cir_buf.p_end = p_mem;
+
+  AE_SETCBEGIN0(p_state->cir_buf.p_begin);
+  AE_SETCEND0(p_state->cir_buf.p_end);
+
+  p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT);
+
+  p_state->p_kernel_padded = (void *)p_kernel; /* default: use the caller's kernel in place */
+
+#if !ENABLE_PADDING_CONV2D_STD
+  if(
+      (input_precision != PREC_ASYM8S) &&
+      (input_precision != PREC_F32) &&
+      (input_precision != PREC_16) &&
+      (input_channels_pad != input_channels)
+    )
+#else
+  if(
+      (input_precision != PREC_16) &&
+      (input_channels_pad != input_channels)
+    )
+#endif
+  {
+    int oc, kh, kw, kernel_size;
+    p_state->p_kernel_padded = (void *)p_mem; /* padded copy lives right after the circular buffer */
+
+    switch(kernel_precision)
+    {
+      case 8:
+        kernel_size = sizeof(WORD8);
+        break;
+      case 16:
+        kernel_size = sizeof(WORD16);
+        break;
+      case -1:
+        kernel_size = sizeof(WORD32);
+        break;
+      case -3:
+        kernel_size = sizeof(UWORD8);
+        break;
+      case -4:
+      case -5:
+        kernel_size = sizeof(WORD8);
+        break;
+      default:
+        kernel_size = 0;
+        break;
+    }
+
+    memset(p_mem, 0, output_channels*kernel_height*kernel_width*input_channels_pad*kernel_size); /* zero everything first so the channel padding stays 0 */
+    pWORD8 p_src = (pWORD8) p_kernel;
+    pWORD8 p_dst = (pWORD8) p_state->p_kernel_padded;
+
+    for(oc = 0; oc < output_channels; oc++)
+    for(kh = 0; kh < kernel_height; kh++)
+    {
+      for(kw = 0; kw < kernel_width; kw++)
+      {
+//        memcpy(p_dst, p_src, kernel_size * input_channels);
+        for(int ii=0; ii<kernel_size * input_channels; ii++){ /* byte-wise copy of one kernel tap */
+          p_dst[ii] = p_src[ii];
+        }
+        p_dst += kernel_size * input_channels_pad; /* destination advances by the padded channel count */
+        p_src += kernel_size * input_channels; /* source advances by the unpadded channel count */
+      }
+    }
+  }
+
+}
+
+VOID xa_nn_conv2d_group_init_state( /* Resets the circular buffer bounds in the state for one group, sized from kernel_channels; it does not touch p_kernel_padded. */
+    VOID *p_scratch, /* start of the scratch area; the state struct is at its beginning */
+    VOID *p_kernel, /* only its address bits are inspected (bank-offset test below) */
+    WORD32 input_height,
+    WORD32 kernel_channels, /* channels per group */
+    WORD32 kernel_height,
+    WORD32 kernel_width,
+    WORD32 y_stride,
+    WORD32 y_padding,
+    WORD32 out_height,
+    WORD32 output_channels, /* not read in this function */
+    WORD32 input_precision, /* selects element size and channel alignment */
+    WORD32 kernel_precision) /* read only inside the HW_AE_ADDCIRC16X4_XC branch */
+{
+  WORD8 *p_mem = (WORD8 *)p_scratch;
+  xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_mem;
+  size_t input_size;
+  UWORD32 align_size;
+
+  switch(input_precision)
+  {
+    case 8:
+      input_size = sizeof(WORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -8:
+    case 16:
+      input_size = sizeof(WORD16);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -1:
+      input_size = sizeof(WORD32);
+      align_size = ALIGNMENT>>2;
+      break;
+    case -3:
+      input_size = sizeof(UWORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -4:
+    case -5:
+      input_size = sizeof(WORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    default: /* unknown precision: both sizes become 0, which makes the buffer size below 0 */
+      input_size = 0;
+      align_size = 0;
+      break;
+  }
+  p_mem += sizeof(xa_nn_conv_state_t); /* skip the state header */
+  p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT);
+
+  if(((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_mem & BUS_WIDTH_MASK))
+  {
+    p_mem += BUS_WIDTH; /* Add an offset to avoid banking stall */
+  }
+
+  p_state->cir_buf.p_begin = p_mem;
+  p_state->cir_buf.p_curr = p_mem;
+
+  // Computing circular buffer size
+  // Determine y-bottom padding
+  WORD32 y_b_pad = kernel_height + (out_height - 1) * y_stride - (y_padding + input_height);
+  y_b_pad = y_b_pad < 0 ? 0 : y_b_pad;
+
+  WORD32 kernel_channels_pad;
+
+#if !ENABLE_PADDING_CONV2D_STD
+  if(input_precision == PREC_ASYM8S)
+  {
+    kernel_channels_pad = kernel_channels; /* no channel padding for this precision in this build configuration */
+  }
+  else
+#endif
+  {
+#if HW_AE_ADDCIRC16X4_XC 
+    /* Disable padding for ic=1 (worst case scenario for performance), if hardware support exists.
+     * Enabled only for conv2d_std_sym8sxasym8s variant */
+    if(kernel_channels == 1 && kernel_precision == PREC_SYM8S && input_precision == PREC_ASYM8S)
+    {
+      kernel_channels_pad = 1;
+    }
+    else
+#endif
+    {
+      kernel_channels_pad = PADDED_SIZE(kernel_channels, align_size);
+    }
+  }
+
+  WORD32 cir_buf_size_bytes = (y_padding + input_height + y_b_pad) * kernel_width * kernel_channels_pad * input_size; /* padded rows x kernel_width planes x padded channels */
+  while(cir_buf_size_bytes%16 !=0) /* grow by whole rows until the size is a multiple of 16 bytes */
+  {
+    cir_buf_size_bytes+= kernel_width*kernel_channels_pad*input_size;
+  }
+
+  p_mem += cir_buf_size_bytes;
+  p_state->cir_buf.p_end = p_mem;
+
+  AE_SETCBEGIN0(p_state->cir_buf.p_begin);
+  AE_SETCEND0(p_state->cir_buf.p_end);
+
+}
+
+VOID xa_nn_conv2d_dilation_init_state(
+    VOID *p_scratch,
+    VOID *p_kernel,
+    VOID *p_input)
+{
+  /* The state header sits at the start of the scratch area; the circular
+   * buffer base is the ALIGNED_ADDR-rounded address right after it. */
+  xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_scratch;
+  WORD8 *p_buf_base = (WORD8 *)p_scratch + sizeof(xa_nn_conv_state_t);
+  p_buf_base = ALIGNED_ADDR(p_buf_base, ALIGNMENT);
+
+  /* Same masked address bits as the kernel: move the base by BUS_WIDTH. */
+  const int same_bank = ((UWORD32)p_kernel & BUS_WIDTH_MASK) == ((UWORD32)p_buf_base & BUS_WIDTH_MASK);
+  if(same_bank)
+    p_buf_base += BUS_WIDTH; /* Add an offset to avoid banking stall */
+
+  p_state->cir_buf.p_base = p_buf_base;
+  p_state->p_inp_base = p_input;
+}
+
+
+VOID xa_nn_dilated_conv2d_std_init_circ_buf( /* Sets the circular buffer bounds for one dilation offset, starting at the previously stored cir_buf.p_base, and builds a channel-padded kernel copy when needed. */
+    VOID *p_scratch, /* points at the state struct; cir_buf.p_base must already be set */
+    VOID *p_kernel,
+    WORD32 input_height,
+    WORD32 input_channels,
+    WORD32 kernel_height_dilation, /* used as the kernel height in all size and copy computations here */
+    WORD32 kernel_width,
+    WORD32 y_stride,
+    WORD32 y_padding,
+    WORD32 out_height,
+    WORD32 output_channels,
+    WORD32 dilation_height,
+    WORD32 dilation_h_offset, /* compared against the remainder of the padded height below */
+    WORD32 input_precision, /* selects element size and channel alignment */
+    WORD32 kernel_precision) /* selects the kernel element size for the padded copy */
+{
+  WORD8 *p_mem;// = (WORD8 *)p_scratch;
+  xa_nn_conv_state_t *p_state = (xa_nn_conv_state_t *)p_scratch;
+  size_t input_size = 0; /* stays 0 for an unrecognised precision */
+  UWORD32 align_size = 0;
+  WORD32 input_channels_pad;
+
+  switch(input_precision)
+  {
+    case 8:
+    case -4:
+      input_size = sizeof(WORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    case 16:
+      input_size = sizeof(WORD16);
+      align_size = ALIGNMENT>>1;
+      break;
+    case -1:
+      input_size = sizeof(WORD32);
+      align_size = ALIGNMENT>>2;
+      break;
+    case -3:
+      input_size = sizeof(UWORD8);
+      align_size = ALIGNMENT>>1;
+      break;
+    default:
+      break;
+  }
+
+  p_state->cir_buf.p_begin = p_state->cir_buf.p_base; /* restart from the stored base */
+  p_state->cir_buf.p_curr = p_state->cir_buf.p_begin;
+
+  p_mem = p_state->cir_buf.p_begin;
+
+  // Computing circular buffer size
+  // Determine y-bottom padding
+#if !ENABLE_PADDING_CONV2D_STD
+  if(input_precision == PREC_8 || input_precision == PREC_ASYM8U || input_precision == PREC_ASYM8S) //TODO: remove the condition when the padding requirement is removed for other variants.
+    input_channels_pad = input_channels;
+  else
+#endif
+    input_channels_pad = PADDED_SIZE(input_channels, align_size);
+
+  // calculate height for this offset case
+  WORD32 y_b_pad_total = kernel_height_dilation + (out_height - 1) * y_stride - (y_padding + input_height);
+  y_b_pad_total = y_b_pad_total < 0 ? 0 : y_b_pad_total;
+
+  WORD32 total_height = (y_padding + input_height + y_b_pad_total);
+  WORD32 height = (total_height/dilation_height) + (WORD32) (((total_height%dilation_height)-1)>=dilation_h_offset); /* one extra row when the remainder reaches past this offset */
+
+  WORD32 cir_buf_size_bytes = height * kernel_width * input_channels_pad * input_size;
+
+  while(cir_buf_size_bytes%16 !=0) /* grow by whole rows until the size is a multiple of 16 bytes */
+  {
+      cir_buf_size_bytes+= kernel_width*input_channels_pad*input_size;
+  }
+
+  p_mem += cir_buf_size_bytes;
+  p_state->cir_buf.p_end = p_mem;
+
+  AE_SETCBEGIN0(p_state->cir_buf.p_begin);
+  AE_SETCEND0(p_state->cir_buf.p_end);
+  
+  p_mem = ALIGNED_ADDR(p_mem, ALIGNMENT);
+
+  p_state->p_kernel_padded = (void *)p_kernel; /* default: use the caller's kernel in place */
+
+#if !ENABLE_PADDING_CONV2D_STD
+  if( (input_precision != PREC_ASYM8S) &&
+      (input_precision != PREC_F32) &&
+      (input_precision != PREC_16) &&
+      (input_channels_pad != input_channels) )
+#else
+  if( (input_precision != PREC_F32) &&
+      (input_precision != PREC_16) &&
+      (input_channels_pad != input_channels) )
+#endif
+  {
+    int oc, kh, kw, kernel_size;
+    p_state->p_kernel_padded = (void *)p_mem; /* padded copy lives right after the circular buffer */
+
+    switch(kernel_precision)
+    {
+      case 8:
+        kernel_size = sizeof(WORD8);
+        break;
+      case 16:
+        kernel_size = sizeof(WORD16);
+        break;
+      case -1:
+        kernel_size = sizeof(WORD32);
+        break;
+      case -3:
+        kernel_size = sizeof(UWORD8);
+        break;
+      case -4:
+      case -5:
+        kernel_size = sizeof(WORD8);
+        break;
+      default:
+        kernel_size = 0;
+        break;
+    }
+
+    pWORD8 p_src = (pWORD8) p_kernel;
+    pWORD8 p_dst = (pWORD8) p_state->p_kernel_padded;
+
+    for(oc = 0; oc < output_channels; oc++)
+    {
+      for(kh = 0; kh < kernel_height_dilation; kh++)
+      {
+        for(kw = 0; kw < kernel_width; kw++)
+        {
+          memcpy(p_dst, p_src, kernel_size * input_channels); /* real channels of this tap */
+          p_dst += kernel_size * input_channels;
+          p_src += kernel_size * input_channels;
+      
+          memset(p_dst, 0, kernel_size * (input_channels_pad - input_channels)); /* zero the padding channels */
+          p_dst += kernel_size * (input_channels_pad - input_channels);
+        }
+      }
+    }
+  }
+}
+
+VOID conv2d_std_init_cir_buf( /* Pre-fills the circular buffer with the first (kernel_width - x_stride) planes of every padded row, leaving room for the planes a later update adds. */
+    WORD32 input_channels,
+    WORD32 input_channels_pad, /* channel count per plane inside the buffer; the extra channels are zeroed */
+    WORD32 input_bytewidth, /* bytes per element */
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding, /* zero rows written before the input rows */
+    WORD32 y_b_pad, /* zero rows written after the input rows */
+    WORD32 x_padding, /* zero planes that precede the input in each row */
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp, /* in: start of the input; out: first input column of row 0 not yet copied */
+    xa_nn_conv_state_t *p_state)
+{
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  WORD32 planes_to_add = x_stride > kernel_width ? 0 : kernel_width - x_stride; /* planes written per row by this call */
+  WORD32 planes_to_keep = kernel_width - planes_to_add; /* planes per row skipped here */
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); /* NOTE(review): presumably advances p_dst by that many bytes with wrap-around - confirm against the intrinsic */
+
+  // Initialize circular buffer
+  // Set first 'y_padding' rows of cir_buf to zero
+  for(i=0;i<y_padding;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, 0, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); /* skip the slots left for the next update */
+  }
+
+  // Set next 'input_height' rows of cir_buf with zero and/or input data
+  WORD32 copy_x_pad_width = x_padding; /* zero planes from the left padding */
+  WORD32 copy_inp_width = 0; /* planes copied from the input */
+  WORD32 rem_copy_width = 0; /* zero planes needed once the input width is exhausted */
+  if(planes_to_add <= x_padding)
+  {
+    copy_x_pad_width = planes_to_add; /* everything written is padding */
+  }
+  else
+  {
+    copy_inp_width = planes_to_add - x_padding;
+    rem_copy_width = XT_MAX(0, copy_inp_width - input_width);
+    copy_inp_width = XT_MIN(copy_inp_width, input_width);
+  }
+  for(i=0;i<input_height;i++)
+  {
+    for(k=0;k<copy_x_pad_width;k++)
+    {
+      memset(p_dst, 0, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    for(k=0;k<copy_inp_width;k++)
+    {
+      memcpy(p_dst, p_inp, input_channels * input_bytewidth);
+      memset(&p_dst[input_channels * input_bytewidth], 0, (input_channels_pad - input_channels) * input_bytewidth); /* zero the padding channels */
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      p_inp += input_channels * input_bytewidth;
+    }
+    for(k=0;k<rem_copy_width;k++)
+    {
+      memset(p_dst, 0, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); /* skip the slots left for the next update */
+    p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth; /* jump to the start of the next input row */
+  }
+
+  // Set last 'y_b_pad' rows of cir_buf to zero
+  for(i=0;i<y_b_pad;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, 0, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+  }
+  p_inp += (-input_height * input_width + copy_inp_width) * input_channels * input_bytewidth; /* rewind to row 0, just past the columns already copied */
+  *pp_inp = (VOID *)p_inp;
+}
+
+VOID conv2d_group_init_cir_buf( /* Group variant of the circular buffer pre-fill: copies kernel_channels values per pixel and writes pad_val (not zero) wherever padding is needed. */
+    WORD32 input_channels, /* channels per input pixel; used as the source step between pixels */
+    WORD32 kernel_channels_pad, /* channel count per plane inside the buffer */
+    WORD32 kernel_channels, /* channels copied per pixel for this group */
+    WORD32 input_bytewidth, /* bytes per element */
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding, /* padding rows written before the input rows */
+    WORD32 y_b_pad, /* padding rows written after the input rows */
+    WORD32 x_padding, /* padding planes that precede the input in each row */
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp, /* in: start of this group's input; out: first input column of row 0 not yet copied */
+    xa_nn_conv_state_t *p_state,
+    WORD32 pad_val) /* padding value; truncated to one byte before use */
+{
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  WORD32 planes_to_add = x_stride > kernel_width ? 0 : kernel_width - x_stride; /* planes written per row by this call */
+  WORD32 planes_to_keep = kernel_width - planes_to_add; /* planes per row skipped here */
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  UWORD8 pad_val_u8 = (UWORD8)pad_val;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); /* NOTE(review): presumably advances p_dst by that many bytes with wrap-around - confirm against the intrinsic */
+
+  // Initialize circular buffer
+  // Set first 'y_padding' rows of cir_buf to pad_val
+  for(i=0;i<y_padding;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, kernel_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); /* skip the slots left for the next update */
+  }
+
+  // Set next 'input_height' rows of cir_buf with pad_val and/or input data
+  WORD32 copy_x_pad_width = x_padding; /* padding planes from the left padding */
+  WORD32 copy_inp_width = 0; /* planes copied from the input */
+  WORD32 rem_copy_width = 0; /* padding planes needed once the input width is exhausted */
+  if(planes_to_add <= x_padding)
+  {
+    copy_x_pad_width = planes_to_add; /* everything written is padding */
+  }
+  else
+  {
+    copy_inp_width = planes_to_add - x_padding;
+    rem_copy_width = XT_MAX(0, copy_inp_width - input_width);
+    copy_inp_width = XT_MIN(copy_inp_width, input_width);
+  }
+  for(i=0;i<input_height;i++)
+  {
+    for(k=0;k<copy_x_pad_width;k++)
+    {
+      memset(p_dst, pad_val_u8, kernel_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+    }
+    for(k=0;k<copy_inp_width;k++)
+    {
+      memcpy(p_dst, p_inp, kernel_channels * input_bytewidth); /* only this group's channels are copied */
+      memset(&p_dst[kernel_channels * input_bytewidth], pad_val_u8, (kernel_channels_pad - kernel_channels) * input_bytewidth); /* padding channels also get pad_val */
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+      p_inp += input_channels * input_bytewidth; /* next pixel: step over all input channels */
+    }
+    for(k=0;k<rem_copy_width;k++)
+    {
+      memset(p_dst, pad_val_u8, kernel_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth); /* skip the slots left for the next update */
+    p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth; /* jump to the start of the next input row */
+  }
+
+  // Set last 'y_b_pad' rows of cir_buf to pad_val
+  for(i=0;i<y_b_pad;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, kernel_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth);
+  }
+  p_inp += (-input_height * input_width + copy_inp_width) * input_channels * input_bytewidth; /* rewind to row 0, just past the columns already copied */
+  *pp_inp = (VOID *)p_inp;
+}
+
+// Slide the circular buffer forward and append min(x_stride, kernel_width)
+// new planes for each of the (y_padding + input_height + y_b_pad) rows.
+// General-purpose variant of conv2d_std_update_cir_buf: it handles top,
+// bottom, left and right padding, writing zeros for every padded position
+// and for the channel tail between input_channels and input_channels_pad.
+// idx_beg_inp_width_pad is the index (in left-padded width coordinates) of
+// the first plane being added. *pp_inp is advanced by the number of input
+// columns consumed plus any columns skipped because x_stride > kernel_width.
+VOID conv2d_std_update_cir_buf_slow(
+    WORD32 input_channels,
+    WORD32 input_channels_pad,
+    WORD32 input_bytewidth,
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,
+    WORD32 y_b_pad,
+    WORD32 x_padding,
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,
+    WORD32 idx_beg_inp_width_pad,
+    xa_nn_conv_state_t *p_state)
+{
+  WORD32 row, col;
+  int b;
+  WORD8 *p_src = (WORD8 *)*pp_inp;
+  const WORD32 planes_to_add = (kernel_width < x_stride) ? kernel_width : x_stride;
+  const WORD32 planes_to_keep = kernel_width - planes_to_add;
+
+  // Byte sizes of the data part, the channel-padding tail and a whole plane.
+  const int data_bytes = input_channels * input_bytewidth;
+  const int tail_bytes = (input_channels_pad - input_channels) * input_bytewidth;
+  const int plane_bytes = input_channels_pad * input_bytewidth;
+  const WORD32 keep_bytes = planes_to_keep * plane_bytes;
+  const WORD32 src_col_bytes = input_channels * input_bytewidth;
+
+  // Advance the buffer start by the planes being replaced, then position the
+  // write pointer just behind the planes that are retained.
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * plane_bytes);
+  WORD8 *p_cir = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+
+  // Top padding rows: all added planes are zero.
+  for(row = 0; row < y_padding; row++)
+  {
+    for(col = 0; col < planes_to_add; col++)
+    {
+      memset(p_cir, 0, plane_bytes);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+  }
+
+  // Classify the added columns of an input row into left padding, input data
+  // and right padding according to where [idx_beg, idx_end) falls.
+  const WORD32 idx_end_inp_width_pad = idx_beg_inp_width_pad + planes_to_add;
+  const WORD32 skip_cols = x_stride - planes_to_add;     // Non-zero for x_stride > kernel_width
+  WORD32 lead_pad_cols = 0;
+  WORD32 data_cols = 0;
+  WORD32 trail_pad_cols = 0;
+  if(idx_beg_inp_width_pad < x_padding)
+  {
+    // Window starts inside the left padding.
+    const WORD32 past_left_pad = idx_end_inp_width_pad - x_padding;
+    lead_pad_cols = x_padding - idx_beg_inp_width_pad;
+    trail_pad_cols = XT_MAX(0, past_left_pad - input_width);
+    data_cols = XT_MIN(past_left_pad, input_width);
+  }
+  else if(idx_end_inp_width_pad <= x_padding + input_width)
+  {
+    // Window lies completely inside the input.
+    data_cols = planes_to_add;
+  }
+  else if(idx_beg_inp_width_pad < x_padding + input_width)
+  {
+    // Window straddles the right edge of the input.
+    data_cols = x_padding + input_width - idx_beg_inp_width_pad;
+    trail_pad_cols = idx_end_inp_width_pad - (x_padding + input_width);
+  }
+  else
+  {
+    // Window lies completely inside the right padding.
+    trail_pad_cols = planes_to_add;
+  }
+
+  if((data_bytes >= 32) || (tail_bytes >= 32) || (plane_bytes >= 32))
+  {
+    // At least one size is 32 bytes or more: use memset/memcpy.
+    for(row = 0; row < input_height; row++)
+    {
+      for(col = 0; col < lead_pad_cols; col++)
+      {
+        memset(p_cir, 0, plane_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      for(col = 0; col < data_cols; col++)
+      {
+        memcpy(p_cir, p_src, data_bytes);
+        memset(&p_cir[data_bytes], 0, tail_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+        p_src += src_col_bytes;
+      }
+      for(col = 0; col < trail_pad_cols; col++)
+      {
+        memset(p_cir, 0, plane_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+      p_src += (input_width - data_cols) * src_col_bytes;
+    }
+  }
+  else
+  {
+    /* All sizes are below 32 bytes: plain byte loops are used so that
+       memset/memcpy are not called for such small sizes. */
+    for(row = 0; row < input_height; row++)
+    {
+      for(col = 0; col < lead_pad_cols; col++)
+      {
+        for(b = 0; b < plane_bytes; b++)
+        {
+          p_cir[b] = 0;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      for(col = 0; col < data_cols; col++)
+      {
+        for(b = 0; b < data_bytes; b++)
+        {
+          p_cir[b] = p_src[b];
+        }
+        for(b = 0; b < tail_bytes; b++)
+        {
+          p_cir[data_bytes + b] = 0;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+        p_src += src_col_bytes;
+      }
+      for(col = 0; col < trail_pad_cols; col++)
+      {
+        for(b = 0; b < plane_bytes; b++)
+        {
+          p_cir[b] = 0;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+      p_src += (input_width - data_cols) * src_col_bytes;
+    }
+  }
+  // Rewind over all input rows; keep the consumed and the skipped columns.
+  p_src += (-input_height * input_width + data_cols + skip_cols) * src_col_bytes;
+
+  // Bottom padding rows: all added planes are zero.
+  for(row = 0; row < y_b_pad; row++)
+  {
+    for(col = 0; col < planes_to_add; col++)
+    {
+      memset(p_cir, 0, plane_bytes);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+  }
+  *pp_inp = (VOID *)p_src;
+}
+
+// Grouped-convolution counterpart of conv2d_std_update_cir_buf_slow.
+// Slides the circular buffer forward and appends min(x_stride, kernel_width)
+// new planes for each of the (y_padding + input_height + y_b_pad) rows.
+// A plane holds kernel_channels channels of data followed by padding up to
+// kernel_channels_pad; consecutive input columns are input_channels apart.
+// Every padded position (rows, columns and channel tail) is filled with the
+// low byte of pad_val. *pp_inp is advanced by the number of input columns
+// consumed plus any columns skipped because x_stride > kernel_width.
+VOID conv2d_group_update_cir_buf_slow(
+    WORD32 input_channels,
+    WORD32 kernel_channels_pad,
+    WORD32 kernel_channels,
+    WORD32 input_bytewidth,
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,
+    WORD32 y_b_pad,
+    WORD32 x_padding,
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,
+    WORD32 idx_beg_inp_width_pad,
+    xa_nn_conv_state_t *p_state,
+    WORD32 pad_val)
+{
+  WORD32 row, col;
+  int b;
+  WORD8 *p_src = (WORD8 *)*pp_inp;
+  const UWORD8 fill_byte = (UWORD8)pad_val;
+  const WORD32 planes_to_add = (kernel_width < x_stride) ? kernel_width : x_stride;
+  const WORD32 planes_to_keep = kernel_width - planes_to_add;
+
+  // Byte sizes of the data part, the channel-padding tail and a whole plane.
+  const int data_bytes = kernel_channels * input_bytewidth;
+  const int tail_bytes = (kernel_channels_pad - kernel_channels) * input_bytewidth;
+  const int plane_bytes = kernel_channels_pad * input_bytewidth;
+  const WORD32 keep_bytes = planes_to_keep * plane_bytes;
+  const WORD32 src_col_bytes = input_channels * input_bytewidth;
+
+  // Advance the buffer start by the planes being replaced, then position the
+  // write pointer just behind the planes that are retained.
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * plane_bytes);
+  WORD8 *p_cir = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+
+  // Top padding rows: all added planes are pure padding.
+  for(row = 0; row < y_padding; row++)
+  {
+    for(col = 0; col < planes_to_add; col++)
+    {
+      memset(p_cir, fill_byte, plane_bytes);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+  }
+
+  // Classify the added columns of an input row into left padding, input data
+  // and right padding according to where [idx_beg, idx_end) falls.
+  const WORD32 idx_end_inp_width_pad = idx_beg_inp_width_pad + planes_to_add;
+  const WORD32 skip_cols = x_stride - planes_to_add;     // Non-zero for x_stride > kernel_width
+  WORD32 lead_pad_cols = 0;
+  WORD32 data_cols = 0;
+  WORD32 trail_pad_cols = 0;
+  if(idx_beg_inp_width_pad < x_padding)
+  {
+    // Window starts inside the left padding.
+    const WORD32 past_left_pad = idx_end_inp_width_pad - x_padding;
+    lead_pad_cols = x_padding - idx_beg_inp_width_pad;
+    trail_pad_cols = XT_MAX(0, past_left_pad - input_width);
+    data_cols = XT_MIN(past_left_pad, input_width);
+  }
+  else if(idx_end_inp_width_pad <= x_padding + input_width)
+  {
+    // Window lies completely inside the input.
+    data_cols = planes_to_add;
+  }
+  else if(idx_beg_inp_width_pad < x_padding + input_width)
+  {
+    // Window straddles the right edge of the input.
+    data_cols = x_padding + input_width - idx_beg_inp_width_pad;
+    trail_pad_cols = idx_end_inp_width_pad - (x_padding + input_width);
+  }
+  else
+  {
+    // Window lies completely inside the right padding.
+    trail_pad_cols = planes_to_add;
+  }
+
+  if ((kernel_channels > 16) || ((kernel_channels_pad-kernel_channels) > 16) || (kernel_channels_pad > 16))
+  {
+    // Larger channel counts: use memset/memcpy.
+    for(row = 0; row < input_height; row++)
+    {
+      for(col = 0; col < lead_pad_cols; col++)
+      {
+        memset(p_cir, fill_byte, plane_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      for(col = 0; col < data_cols; col++)
+      {
+        memcpy(p_cir, p_src, data_bytes);
+        memset(&p_cir[data_bytes], fill_byte, tail_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+        p_src += src_col_bytes;
+      }
+      for(col = 0; col < trail_pad_cols; col++)
+      {
+        memset(p_cir, fill_byte, plane_bytes);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+      p_src += (input_width - data_cols) * src_col_bytes;
+    }
+  }
+  else
+  {
+    // At most 16 channels everywhere: plain byte loops instead of libc calls.
+    for(row = 0; row < input_height; row++)
+    {
+      for(col = 0; col < lead_pad_cols; col++)
+      {
+        for(b = 0; b < plane_bytes; b++)
+        {
+          p_cir[b] = fill_byte;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      for(col = 0; col < data_cols; col++)
+      {
+        for(b = 0; b < data_bytes; b++)
+        {
+          p_cir[b] = p_src[b];
+        }
+        for(b = 0; b < tail_bytes; b++)
+        {
+          p_cir[data_bytes + b] = fill_byte;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+        p_src += src_col_bytes;
+      }
+      for(col = 0; col < trail_pad_cols; col++)
+      {
+        for(b = 0; b < plane_bytes; b++)
+        {
+          p_cir[b] = fill_byte;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+      p_src += (input_width - data_cols) * src_col_bytes;
+    }
+  }
+  // Rewind over all input rows; keep the consumed and the skipped columns.
+  p_src += (-input_height * input_width + data_cols + skip_cols) * src_col_bytes;
+
+  // Bottom padding rows: all added planes are pure padding.
+  for(row = 0; row < y_b_pad; row++)
+  {
+    for(col = 0; col < planes_to_add; col++)
+    {
+      memset(p_cir, fill_byte, plane_bytes);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, plane_bytes);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_cir, keep_bytes);
+  }
+  *pp_inp = (VOID *)p_src;
+}
+
+// Add x_stride (but not more than kernel_width) x (input_height x input_channels) new planes to circular buffer.
+//
+// Dispatcher with a padding-free fast path:
+//  - If any of y_padding, y_b_pad or x_padding is non-zero, or if the planes
+//    being added reach past the right edge of the input (right padding is
+//    needed), the work is delegated to conv2d_std_update_cir_buf_slow, which
+//    handles every padding case.
+//  - Otherwise every added plane is a plain copy of input_channels *
+//    input_bytewidth bytes from the input. The bytes between input_channels
+//    and input_channels_pad of each plane are not written by the fast path.
+//
+// Parameters:
+//  input_channels / input_channels_pad : channels per input column / per buffer plane
+//  input_bytewidth                     : bytes per element
+//  input_width / input_height          : input dimensions (in elements)
+//  y_padding / y_b_pad / x_padding     : top / bottom / left padding
+//  kernel_width / x_stride             : kernel width and horizontal stride
+//  pp_inp                              : in/out input pointer; advanced by x_stride columns on the fast path
+//  idx_beg_inp_width_pad               : index, in left-padded width coordinates, of the first plane added
+//  p_state                             : convolution state; cir_buf.p_curr is advanced by the planes added
+VOID conv2d_std_update_cir_buf(
+    WORD32 input_channels,
+    WORD32 input_channels_pad,
+    WORD32 input_bytewidth,
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,
+    WORD32 y_b_pad,
+    WORD32 x_padding,
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,
+    WORD32 idx_beg_inp_width_pad,
+    xa_nn_conv_state_t *p_state)
+{
+  WORD32 planes_to_add = x_stride > kernel_width ? kernel_width : x_stride;
+  WORD32 planes_to_keep = kernel_width - planes_to_add;
+
+  // The fast path copies 'planes_to_add' columns from every input row without
+  // any bounds check. When the added planes extend beyond the last input
+  // column it would read into the next row (or past the end of the input on
+  // the last row) instead of producing right padding, so that case must take
+  // the slow path as well.
+  WORD32 needs_right_pad = (idx_beg_inp_width_pad + planes_to_add) > (x_padding + input_width);
+
+  if (y_padding != 0 || y_b_pad != 0 || x_padding != 0 || needs_right_pad) {
+    conv2d_std_update_cir_buf_slow(
+      input_channels,
+      input_channels_pad,
+      input_bytewidth,
+      input_width,
+      input_height,
+      y_padding,
+      y_b_pad,
+      x_padding,
+      kernel_width,
+      x_stride,
+      pp_inp,
+      idx_beg_inp_width_pad,
+      p_state
+    );
+    return;
+  }
+
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+
+  // Advance the buffer start by the planes being replaced, then position the
+  // write pointer just behind the planes that are retained.
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad * input_bytewidth);
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+
+  // No padding is involved here: every added plane comes from the input.
+  WORD32 copy_inp_width = planes_to_add;
+  WORD32 to_skip_inp_width = x_stride - planes_to_add;     // Non-zero for x_stride > kernel_width
+
+  int size = input_channels * input_bytewidth;
+  if (size <= 32) {
+    // Small planes: copy byte by byte instead of calling memcpy.
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_inp_width;k++)
+      {
+        for (int j = 0; j < size; ++j) {
+          p_dst[j] = p_inp[j];
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+      // Jump to the same column position in the next input row.
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;
+    }
+  } else {
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_inp_width;k++)
+      {
+        memcpy(p_dst, p_inp, input_channels * input_bytewidth);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+      // Jump to the same column position in the next input row.
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;
+    }
+  }
+  // Rewind over all input rows; keep the consumed and the skipped columns.
+  p_inp += (-input_height * input_width + copy_inp_width + to_skip_inp_width) * input_channels * input_bytewidth;
+
+  *pp_inp = (VOID *)p_inp;
+}
+
+// Grouped-convolution variant: add x_stride (but not more than kernel_width)
+// new planes per row to the circular buffer.
+//
+// Dispatcher with a padding-free fast path:
+//  - If any of y_padding, y_b_pad or x_padding is non-zero, or if the planes
+//    being added reach past the right edge of the input (right padding is
+//    needed), the work is delegated to conv2d_group_update_cir_buf_slow,
+//    which handles every padding case.
+//  - Otherwise every added plane is kernel_channels * input_bytewidth bytes
+//    copied from the input, followed by a channel tail up to
+//    kernel_channels_pad filled with the low byte of pad_val.
+//
+// Parameters:
+//  input_channels                      : channels per input column (stride between columns)
+//  kernel_channels / kernel_channels_pad : channels copied per plane / channels per buffer plane
+//  input_bytewidth                     : bytes per element
+//  input_width / input_height          : input dimensions (in elements)
+//  y_padding / y_b_pad / x_padding     : top / bottom / left padding
+//  kernel_width / x_stride             : kernel width and horizontal stride
+//  pp_inp                              : in/out input pointer; advanced by x_stride columns on the fast path
+//  idx_beg_inp_width_pad               : index, in left-padded width coordinates, of the first plane added
+//  p_state                             : convolution state; cir_buf.p_curr is advanced by the planes added
+//  pad_val                             : padding value; only its low byte is used
+VOID conv2d_group_update_cir_buf(
+    WORD32 input_channels,
+    WORD32 kernel_channels_pad,
+    WORD32 kernel_channels,
+    WORD32 input_bytewidth,
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,
+    WORD32 y_b_pad,
+    WORD32 x_padding,
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,
+    WORD32 idx_beg_inp_width_pad,
+    xa_nn_conv_state_t *p_state,
+    WORD32 pad_val)
+{
+  WORD32 planes_to_add = x_stride > kernel_width ? kernel_width : x_stride;
+  WORD32 planes_to_keep = kernel_width - planes_to_add;
+
+  // The fast path copies 'planes_to_add' columns from every input row without
+  // any bounds check. When the added planes extend beyond the last input
+  // column it would read into the next row (or past the end of the input on
+  // the last row) instead of producing pad_val planes, so that case must take
+  // the slow path as well.
+  WORD32 needs_right_pad = (idx_beg_inp_width_pad + planes_to_add) > (x_padding + input_width);
+
+  if (y_padding != 0 || y_b_pad != 0 || x_padding != 0 || needs_right_pad) {
+    conv2d_group_update_cir_buf_slow(
+      input_channels,
+      kernel_channels_pad,
+      kernel_channels,
+      input_bytewidth,
+      input_width,
+      input_height,
+      y_padding,
+      y_b_pad,
+      x_padding,
+      kernel_width,
+      x_stride,
+      pp_inp,
+      idx_beg_inp_width_pad,
+      p_state,
+      pad_val
+    );
+    return;
+  }
+
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  UWORD8 pad_val_u8 = (UWORD8)pad_val;
+
+  // Advance the buffer start by the planes being replaced, then position the
+  // write pointer just behind the planes that are retained.
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * kernel_channels_pad * input_bytewidth);
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth);
+
+  // No padding is involved here: every added plane comes from the input.
+  WORD32 copy_inp_width = planes_to_add;
+  WORD32 to_skip_inp_width = x_stride - planes_to_add;     // Non-zero for x_stride > kernel_width
+
+  int size = kernel_channels * input_bytewidth;
+  const int size2 = (kernel_channels_pad - kernel_channels) * input_bytewidth;
+  if ((kernel_channels <= 16) && ((kernel_channels_pad-kernel_channels) <= 16)) {
+    // Small planes: copy and pad byte by byte instead of calling memcpy/memset.
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_inp_width;k++)
+      {
+        for (int j = 0; j < size; ++j) {
+          p_dst[j] = p_inp[j];
+        }
+        for(int ii=0; ii < size2; ii++) {
+          p_dst[size + ii] = pad_val_u8;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth);
+      // Jump to the same column position in the next input row.
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;
+    }
+  } else {
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_inp_width;k++)
+      {
+        memcpy(p_dst, p_inp, kernel_channels * input_bytewidth);
+        memset(&p_dst[kernel_channels * input_bytewidth], pad_val_u8, (kernel_channels_pad - kernel_channels) * input_bytewidth);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, kernel_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * kernel_channels_pad * input_bytewidth);
+      // Jump to the same column position in the next input row.
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;
+    }
+  }
+  // Rewind over all input rows; keep the consumed and the skipped columns.
+  p_inp += (-input_height * input_width + copy_inp_width + to_skip_inp_width) * input_channels * input_bytewidth;
+
+  *pp_inp = (VOID *)p_inp;
+
+}
+
+VOID xa_nn_dilated_conv2d_std_load_cir_buf_asym8(
+    WORD32 input_channels,
+    WORD32 input_channels_pad,
+    WORD32 input_bytewidth,
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,
+    WORD32 y_b_pad,
+    WORD32 x_padding,
+    WORD32 kernel_width,
+    VOID **pp_inp,
+    xa_nn_conv_state_t *p_state,
+    WORD32 pad_val,
+    WORD32 dilation_height,
+    WORD32 dilation_h_offset,
+    WORD32 dilation_width,
+    WORD32 dilation_w_offset,
+    WORD32 x_padding_full,
+    WORD32 *input_padding_consumed,
+    WORD32 *input_width_consumed,
+    WORD32 planes_to_add,
+    WORD32 firstCall,
+    WORD32 *circMatrixHeight,
+    WORD32 widthIndexIteration,
+    WORD32 x_stride_dilated,
+    WORD32 heightIndexIteration)
+{ 
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  //WORD32 planes_to_add = x_stride > kernel_width ? 0 : kernel_width - x_stride;
+  WORD32 planes_to_keep = kernel_width - planes_to_add;
+  //ae_int8x8 zero_pad = AE_MOVDA8(pad_val);
+  UWORD8 pad_val_u8 = (UWORD8)pad_val;
+  //ae_int8x8 inp_val;
+  (void) input_bytewidth;
+  WORD32 y_padding_dilation;
+
+  if(!firstCall)
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad);
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad);
+
+  WORD32 indexCorrectionDoneInHeight = 1;
+  WORD32 heightIndexIterationModified = heightIndexIteration;
+  y_padding_dilation = (y_padding / dilation_height) + (WORD32)(((y_padding%dilation_height)-1)>=dilation_h_offset);
+  WORD32 y_padding_dilation_indexCorrected = y_padding_dilation - heightIndexIteration;
+  if(y_padding_dilation_indexCorrected<0)
+  {
+      indexCorrectionDoneInHeight = 0;
+      heightIndexIterationModified = -y_padding_dilation_indexCorrected;
+      y_padding_dilation_indexCorrected = 0;
+  }
+  *circMatrixHeight = 0;
+  *circMatrixHeight = *circMatrixHeight + y_padding_dilation_indexCorrected;
+  // Initialize circular buffer
+  /*if(input_channels == 1)
+  {
+    // Set first 'y_padding' rows of cir_buf to zero
+    for(i=0;i<y_padding;i++)
+    {
+      for(k=0;k<planes_to_add;k++)
+      {
+        AE_S8_0_XC(zero_pad, (ae_int8 *)p_dst, 1);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep);
+    }
+  }
+  else*/
+  {
+    // Set first 'y_padding' rows of cir_buf to zero
+    for(i=0;i<y_padding_dilation_indexCorrected;i++)
+    {
+      for(k=0;k<planes_to_add;k++)
+      {
+        memset(p_dst, pad_val_u8, input_channels_pad);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad);
+    }
+  }
+
+  // Set next 'input_height' rows of cir_buf with zero and/or input data
+    //estimate no.zeros for this offset
+
+  ///Calculate number x padding remaining for this width offset which can participate in the convolution process
+  WORD32 x_padding_full_dilation = (x_padding_full/dilation_width) + (WORD32) ( ((x_padding_full%dilation_width)-1) >= dilation_w_offset);//This is the contribution of zero padding(in total) towards this width offset
+  WORD32 x_padding_dilation_initial_pad = ((x_padding_full-x_padding)/dilation_width) + (WORD32) ( (((x_padding_full-x_padding)%dilation_width)-1) >= dilation_w_offset); /// This offset's contribution which has been absorbed in initial analysis of zero padding
+  WORD32 x_padding_dilation = x_padding_full_dilation - x_padding_dilation_initial_pad;//This is the num of zeros contribution from left padding for this dilation offset
+  WORD32 indexCorrectionDoneInWidth = 1;
+  WORD32 widthIndexIterationModified = widthIndexIteration;
+  //Accounting for initial width index/point in this sub-matrix for this offset (This arises from stride implementation)
+  WORD32 x_padding_dilation_postIndexCorrection = x_padding_dilation;// - widthIndexIteration;/// If this value lr. than zero implies first width-index inside this sub-matrix is inside input matrix after crossing left zero padding
+
+          x_padding_dilation_postIndexCorrection = x_padding_dilation_postIndexCorrection - widthIndexIteration;
+          if(x_padding_dilation_postIndexCorrection<0)
+          {
+              indexCorrectionDoneInWidth = 0;
+              widthIndexIterationModified = -x_padding_dilation_postIndexCorrection;
+              x_padding_dilation_postIndexCorrection = 0;
+          }
+          else
+          {
+              indexCorrectionDoneInWidth = 1;
+              widthIndexIterationModified = 0;
+          }
+
+  x_padding_dilation = x_padding_dilation_postIndexCorrection - (*input_padding_consumed); /// When this loop called repeatedly; some of the input will be consumed discounting for that
+
+  if(x_padding_dilation<0)
+      x_padding_dilation = 0;/// This condition can occur when we are done with zero padding section in the prev. iteration(can be first iteration in corner case)
+
+
+  /// Calculate number of input width/columns remaining for this width offset which can participate in the convolution process
+  WORD32 x_padding_plus_input_dilation = ( (x_padding_full+input_width)/dilation_width) + (WORD32) ( (((x_padding_full+input_width)%dilation_width)-1) >= dilation_w_offset);//This is the num elements to be convolved for this offset in total(zeropad+input)
+  WORD32 x_input_dilation = x_padding_plus_input_dilation - x_padding_full_dilation;// This is the number of elements from input that can potentially be populated
+  WORD32 x_input_dilation_postIndexCorrection;
+  WORD32 input_width_correction;
+
+  if(indexCorrectionDoneInWidth==0)
+  {
+      x_input_dilation_postIndexCorrection = x_input_dilation - widthIndexIterationModified; // this value if -ve correction flows towards right z.p
+      if(x_input_dilation_postIndexCorrection<0)
+      {
+          indexCorrectionDoneInWidth = 0;
+          widthIndexIterationModified = -x_input_dilation_postIndexCorrection;
+          x_input_dilation_postIndexCorrection = 0;
+          input_width_correction = x_input_dilation;
+      }
+      else
+      {
+          indexCorrectionDoneInWidth = 1;
+          input_width_correction = widthIndexIterationModified;
+          widthIndexIterationModified = 0;
+
+      }
+  }
+  else
+  {
+      x_input_dilation_postIndexCorrection = x_input_dilation;
+      input_width_correction = 0;
+  }
+
+  WORD32 x_input_dilation_postIndexCorrection_total = x_input_dilation_postIndexCorrection;/// This is the total convoble area after adjustng for stride offset for this dilation_offset
+  x_input_dilation_postIndexCorrection = x_input_dilation_postIndexCorrection - (*input_width_consumed);//consumedInput;/// When this loop called repeatedly; some of the input will be consumed discounting for that
+  if(x_input_dilation_postIndexCorrection<0)
+      x_input_dilation_postIndexCorrection = 0;/// This implies the control is to right padding
+
+  WORD32 copy_x_pad_width, copy_x_r_pad_width, copy_inp_width;
+
+  if(planes_to_add <= x_padding_dilation)
+  {
+    copy_x_pad_width = planes_to_add;
+    copy_inp_width = 0;
+    copy_x_r_pad_width = 0;
+  }
+  else if(planes_to_add <= (x_padding_dilation+x_input_dilation_postIndexCorrection) )
+  {
+      copy_x_pad_width = x_padding_dilation;
+      copy_inp_width = planes_to_add - copy_x_pad_width;
+      copy_x_r_pad_width = 0;
+  }
+  else
+  {
+      copy_x_pad_width = x_padding_dilation;
+      copy_inp_width = x_input_dilation_postIndexCorrection;
+      copy_x_r_pad_width = planes_to_add - (copy_x_pad_width+copy_inp_width) ;/// No need to calculate the right padding exactly as the loop outside i.e, calling function takes care of it
+  }
+
+  {
+    // estimate total number of height values for height_offset value from the input matrix
+    WORD32 input_padding_plus_height_dilation = ( (y_padding+input_height) / dilation_height) + (WORD32)((((y_padding+input_height)%dilation_height)-1)>=dilation_h_offset);
+    WORD32 input_height_dilation = input_padding_plus_height_dilation - y_padding_dilation;//y_padding_dilation; /// This value is the height of the circular matrix that has to be iterated for non-zero input values i.e., without top padding and bottim padding iterations
+    WORD32 input_height_dilation_indexCorrected = input_height_dilation;
+    WORD32 input_height_correction = 0;
+    if(indexCorrectionDoneInHeight==0)
+    {
+        input_height_dilation_indexCorrected = input_height_dilation_indexCorrected - heightIndexIterationModified;
+        if(input_height_dilation_indexCorrected<0)
+        {
+            indexCorrectionDoneInHeight = 0;
+            heightIndexIterationModified = -input_height_dilation_indexCorrected;
+            input_height_dilation_indexCorrected =  0;
+            input_height_correction = input_height_dilation;
+        }
+        else
+        {
+            indexCorrectionDoneInHeight = 1;
+            input_height_correction = heightIndexIterationModified;
+            heightIndexIterationModified = 0;
+
+        }
+    }
+    *circMatrixHeight = *circMatrixHeight + input_height_dilation_indexCorrected;
+
+    /// estimate the offset needed in the input matrix for this height offset
+    WORD32 index_0_input_dilation_height_offset =  (y_padding % dilation_height) ; ///This value represent 0th index in input matrix (post top padding) correspond to which offset in height's dilation scale
+    WORD32 input_offset_height_dilation = (dilation_h_offset - index_0_input_dilation_height_offset + dilation_height)%dilation_height;// "index_0_input_dilation_height_offset" represent the dilation offset corresponding to 0 th row of input but, the target is to reach "dilation_h_offset" in dilation scale. This calculation helps reach there from "index_0_input_dilation_height_offset"
+
+    p_inp = p_inp + (input_offset_height_dilation * input_width * input_channels); // This offsets the pointer as per the dilation offset in height dimension for stride=1. While supporting stride find the point inside sub matrix that is the starting point
+    p_inp = p_inp + (input_height_correction * dilation_height * input_width * input_channels);///This accounts for offset i.e., initial index that arises out of stride support
+    /// In the above calculation of pointer ystride is not brought into calculation, in height dimension Ystride will be handled by core convolution code
+
+    //for(i=0;i<input_height_dilation;i++)
+    for(i=0;i<input_height_dilation_indexCorrected;i++)
+    {
+      for(k=0;k<copy_x_pad_width;k++)
+      {
+        memset(p_dst, pad_val_u8, input_channels_pad);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad);
+      }
+      WORD32 index_0_input_dilation_offset =  (x_padding_full % dilation_width) ; ///This represent 0th index in input matrix correspond to which offset in dilation
+      WORD32 input_offset_dilation = (dilation_w_offset - index_0_input_dilation_offset + dilation_width)%dilation_width;// This is the offset corresponding to the present width offset
+      p_inp = p_inp + (  (input_offset_dilation + (*input_width_consumed)*dilation_width)   *input_channels);/// This is the offset corresponding
+      p_inp = p_inp + input_width_correction * dilation_width * input_channels;
+      // Pointer Offset in width dimension here does not have an exclusive mention as explained below:
+      // a) If stride value is smaller than the kernel then "planes_to_add" would be loaded with "stride" value outside the call. This data will begin from the width index after dropping xstride values. So, no need to exclusively mention this
+      // b) If stride value is gr. than kernel then "strideConsumption" would be accounted in the total consumed value and the next index would start appropriately accounting for stride in an indirect fashion
+
+
+      for(k=0;k<copy_inp_width;k++)
+      {
+        memcpy(p_dst, p_inp, input_channels);
+        memset(&p_dst[input_channels], pad_val_u8, (input_channels_pad - input_channels));
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad);
+        p_inp += (input_channels*dilation_width);
+      }
+      for(k=0;k<copy_x_r_pad_width;k++)
+      {
+          memset(p_dst, pad_val_u8, input_channels_pad);
+          AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad);
+      p_inp += ( (input_width - ((copy_inp_width*dilation_width)+(input_offset_dilation + (*input_width_consumed)*dilation_width) + (input_width_correction * dilation_width)  ) ) + ((dilation_height-1)*input_width) )* input_channels;
+
+    }
+
+    if ( (copy_inp_width >0) && (x_stride_dilated>kernel_width) )
+        *input_width_consumed = *input_width_consumed + x_stride_dilated - copy_x_pad_width;/// Account for stride consumption only if there was any consumption. Reduce whatever was consumed in left zp
+    else
+        *input_width_consumed = *input_width_consumed + copy_inp_width;
+
+    if(x_input_dilation_postIndexCorrection_total < (*input_width_consumed) )
+        *input_width_consumed = x_input_dilation_postIndexCorrection_total;
+
+
+    if ( (copy_x_pad_width >0) && (x_stride_dilated>kernel_width) )
+        *input_padding_consumed = *input_padding_consumed + x_stride_dilated ;
+    else
+        *input_padding_consumed = *input_padding_consumed + copy_x_pad_width ;
+
+    if(x_padding_dilation_postIndexCorrection <  (*input_padding_consumed) )
+        *input_padding_consumed = x_padding_dilation_postIndexCorrection;
+
+    /// Similar consumption calculation is not needed for right padding. This is because in right padding number of points will be lesser than kernel width as the outside function would have absolved all other right padding indices implying there would not be more than one call to fill right padding as a part of circular matrix loading
+
+
+    WORD32 input_height_toppadding_plus_input_plus_bottom_padding =  ((y_padding+input_height+y_b_pad) / dilation_height) + (WORD32)((((y_padding+input_height+y_b_pad)%dilation_height)-1)>=dilation_h_offset);// This is the total number of input points used for convolution for this height offset value
+    WORD32 y_b_pad_dilation = input_height_toppadding_plus_input_plus_bottom_padding - (y_padding_dilation+input_height_dilation);/// This calculates number of bottom padding points for this dilation offset i.e., dilation_h_offset
+
+    WORD32 input_bpadding_dilation_indexCorrected = y_b_pad_dilation;
+
+    if(indexCorrectionDoneInHeight==0)
+    {
+        input_bpadding_dilation_indexCorrected = input_bpadding_dilation_indexCorrected - heightIndexIterationModified;
+    }
+    *circMatrixHeight = *circMatrixHeight + input_bpadding_dilation_indexCorrected;
+    // Set last 'y_b_pad' rows of cir_buf to zero
+    for(i=0;i<input_bpadding_dilation_indexCorrected;i++)
+    {
+      for(k=0;k<planes_to_add;k++)
+      {
+        memset(p_dst, pad_val_u8, input_channels_pad);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad);
+    }
+
+  }
+}
+
+// Prime the circular buffer with the first (kernel_width - x_stride) columns of the padded input (none when x_stride > kernel_width), covering the top-padding, input and bottom-padding rows.
+VOID conv2d_std_init_cir_buf_asym8(
+    WORD32 input_channels,       // channels copied from the input for each pixel
+    WORD32 input_channels_pad,   // slot size per pixel in the buffer; the tail past input_channels is filled with pad_val
+    WORD32 input_bytewidth,      // scales every element count below into a byte count
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,            // padding rows written before the input rows
+    WORD32 y_b_pad,              // padding rows written after the input rows
+    WORD32 x_padding,            // left padding columns
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,               // in/out: on return advanced by the input columns copied per row
+    xa_nn_conv_state_t *p_state, // cir_buf.p_curr supplies the starting write position
+    WORD32 pad_val)              // padding value, truncated to 8 bits
+{
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  WORD32 planes_to_add = x_stride > kernel_width ? 0 : kernel_width - x_stride;
+  WORD32 planes_to_keep = kernel_width - planes_to_add;
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  UWORD8 pad_val_u8 = (UWORD8)pad_val;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); // step over the slots not written here (NOTE(review): assumes the intrinsic advances p_dst with wrap-around - confirm)
+
+  // Initialize circular buffer: set first 'y_padding' rows of cir_buf to pad_val
+  for(i=0;i<y_padding;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+  }
+
+  // Set next 'input_height' rows of cir_buf with pad_val (left padding), input data and/or pad_val (columns past the input width)
+  WORD32 copy_x_pad_width = x_padding;
+  WORD32 copy_inp_width = 0;
+  WORD32 rem_copy_width = 0;
+  if(planes_to_add <= x_padding)
+  {
+    copy_x_pad_width = planes_to_add;   // everything written lies inside the left padding
+  }
+  else
+  {
+    copy_inp_width = planes_to_add - x_padding;
+    rem_copy_width = XT_MAX(0, copy_inp_width - input_width);   // columns requested beyond the input width are filled with pad_val
+    copy_inp_width = XT_MIN(copy_inp_width, input_width);
+  }
+  for(i=0;i<input_height;i++)
+  {
+    for(k=0;k<copy_x_pad_width;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    for(k=0;k<copy_inp_width;k++)
+    {
+      memcpy(p_dst, p_inp, input_channels * input_bytewidth);
+      memset(&p_dst[input_channels * input_bytewidth], pad_val_u8, (input_channels_pad - input_channels) * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      p_inp += input_channels * input_bytewidth;
+    }
+    for(k=0;k<rem_copy_width;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);   // byte count scaled by input_bytewidth, like every other plane fill in this function
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+    p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;   // move to the start of the next input row
+  }
+
+  // Set last 'y_b_pad' rows of cir_buf to pad_val
+  for(i=0;i<y_b_pad;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+  }
+  p_inp += (-input_height * input_width + copy_inp_width) * input_channels * input_bytewidth;   // rewind to row 0; net effect is copy_inp_width pixels past the entry position
+  *pp_inp = (VOID *)p_inp;
+}
+
+// Add x_stride (but not more than kernel_width) new planes to the circular buffer for each of the (y_padding + input_height + y_b_pad) rows, starting at padded-width index idx_beg_inp_width_pad
+VOID conv2d_std_update_cir_buf_asym8(
+    WORD32 input_channels,          // channels copied from the input for each pixel
+    WORD32 input_channels_pad,      // slot size per pixel in the buffer; the tail past input_channels is filled with pad_val
+    WORD32 input_bytewidth,         // scales every element count below into a byte count
+    WORD32 input_width,
+    WORD32 input_height,
+    WORD32 y_padding,               // padding rows written before the input rows
+    WORD32 y_b_pad,                 // padding rows written after the input rows
+    WORD32 x_padding,               // left padding columns
+    WORD32 kernel_width,
+    WORD32 x_stride,
+    VOID **pp_inp,                  // in/out: on return advanced by the input columns copied plus the columns skipped
+    WORD32 idx_beg_inp_width_pad,   // first column of this update, counted in left-padded width coordinates
+    xa_nn_conv_state_t *p_state,    // circular buffer state; cir_buf.p_curr is updated below
+    WORD32 pad_val)                 // padding value, truncated to 8 bits
+{
+  WORD32 i,k;
+  WORD8 *p_inp = (WORD8 *)*pp_inp;
+  UWORD8 pad_val_u8 = (UWORD8)pad_val;
+  WORD32 planes_to_add = x_stride > kernel_width ? kernel_width : x_stride;   // columns written per row by this call
+  WORD32 planes_to_keep = kernel_width - planes_to_add;
+
+  // Copy 'planes_to_add' planes of data to circular buffer
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_state->cir_buf.p_curr, planes_to_add * input_channels_pad * input_bytewidth); // applied to the stored cir_buf.p_curr itself (NOTE(review): presumably slides the buffer start forward by planes_to_add slots - confirm)
+  WORD8 *p_dst = (WORD8 *)p_state->cir_buf.p_curr;
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth); // step the local write pointer over the slots that are kept
+
+  // Set first 'y_padding' rows of cir_buf to pad_val
+  for(i=0;i<y_padding;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+  }
+
+  // Set next 'input_height' rows of cir_buf with pad_val (from x_padding) and/or input data and/or pad_val (from x-right padding)
+  WORD32 idx_end_inp_width_pad = idx_beg_inp_width_pad + planes_to_add;   // one past the last column of this update
+  WORD32 copy_x_pad_width = 0;
+  WORD32 copy_inp_width = 0;
+  WORD32 to_skip_inp_width = x_stride - planes_to_add;     // Non-zero for x_stride > kernel_width
+  WORD32 copy_x_r_pad_width = 0;
+  if(idx_beg_inp_width_pad < x_padding)   // window starts inside the left padding
+  {
+    copy_x_pad_width = x_padding - idx_beg_inp_width_pad;
+    copy_inp_width = idx_end_inp_width_pad - x_padding;   // NOTE(review): negative if the window also ends before x_padding; callers presumably never pass that - confirm
+    copy_x_r_pad_width = XT_MAX(0, copy_inp_width - input_width);
+    copy_inp_width = XT_MIN(copy_inp_width, input_width);
+  }
+  else if(idx_end_inp_width_pad <= x_padding + input_width)   // window lies entirely inside the input
+  {
+    copy_inp_width = planes_to_add;
+  }
+  else if(idx_beg_inp_width_pad < x_padding + input_width)   // window starts in the input and runs past its right edge
+  {
+    copy_inp_width = x_padding + input_width - idx_beg_inp_width_pad;
+    copy_x_r_pad_width = idx_end_inp_width_pad - (x_padding + input_width);
+  }
+  else   // window lies entirely past the right edge of the input
+  {
+    copy_x_r_pad_width = planes_to_add;
+  }
+
+  const int size1 = input_channels * input_bytewidth;                          // bytes of real data per pixel
+  const int size2 = (input_channels_pad - input_channels) * input_bytewidth;   // bytes of channel padding per pixel
+  const int size3 = input_channels_pad * input_bytewidth;                      // bytes of one full slot
+  if( (size1 < 16) && (size2 < 16) && (size3 < 16)){
+  /* Sizes below 16 bytes: plain byte loops are used instead of calling memset/memcpy */
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_x_pad_width;k++)
+      {
+        int ii;
+        for(ii = 0; ii < size3; ii++) {
+          p_dst[ii] = pad_val_u8;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      }
+      for(k=0;k<copy_inp_width;k++)
+      {
+        int ii;
+        for(ii = 0; ii < size1; ii++){
+          p_dst[ii] = p_inp[ii];
+        }
+        for(ii=0; ii < size2; ii++) {
+          p_dst[size1 + ii] = pad_val_u8;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      for(k=0;k<copy_x_r_pad_width;k++)
+      {
+        int ii;
+        for(ii = 0; ii < size3; ii++) {
+          p_dst[ii] = pad_val_u8;
+        }
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;   // move to the same column of the next input row
+    }
+  } else {
+    // Same three fills as the branch above, done with memset/memcpy
+    for(i=0;i<input_height;i++)
+    {
+      for(k=0;k<copy_x_pad_width;k++)
+      {
+        memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      }
+      for(k=0;k<copy_inp_width;k++)
+      {
+        memcpy(p_dst, p_inp, input_channels * input_bytewidth);
+        memset(&p_dst[input_channels * input_bytewidth], pad_val_u8, (input_channels_pad - input_channels) * input_bytewidth);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+        p_inp += input_channels * input_bytewidth;
+      }
+      for(k=0;k<copy_x_r_pad_width;k++)
+      {
+        memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+        AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+      }
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+      p_inp += (input_width - copy_inp_width) * input_channels * input_bytewidth;   // move to the same column of the next input row
+    }
+  }
+
+  p_inp += (-input_height * input_width + copy_inp_width + to_skip_inp_width) * input_channels * input_bytewidth;   // rewind to row 0, then move right by the columns copied plus the columns skipped
+
+  // Set last 'y_b_pad' rows of cir_buf to pad_val
+  for(i=0;i<y_b_pad;i++)
+  {
+    for(k=0;k<planes_to_add;k++)
+    {
+      memset(p_dst, pad_val_u8, input_channels_pad * input_bytewidth);
+      AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, input_channels_pad * input_bytewidth);
+    }
+    AE_ADDCIRC16X4_XC((ae_int16x4 *)p_dst, planes_to_keep * input_channels_pad * input_bytewidth);
+  }
+  *pp_inp = (VOID *)p_inp;
+}
+#endif // #ifndef ENABLE_SCRATCH_SIZE_API_ONLY
+
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_matXvec_asym8xasym8_asym8_circ.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_matXvec_asym8xasym8_asym8_circ.c
new file mode 100644
index 00000000000..28dd7a40bb2
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_matXvec_asym8xasym8_asym8_circ.c
@@ -0,0 +1,299 @@
+#include "xa_type_def.h"
+//#include "xa_nn_common.h"
+#include "xa_nn_conv2d_std_state.h"
+
+#include "xa_nnlib_common.h"
+#include "xa_nnlib_quant_macros.h"
+
+#define ZERO64  AE_ZERO64()
+/* Rows (ROW_UNROLL) and vectors (VEC_UNROLL) handled per pass of the unrolled loops in the kernel below. */
+#define ROW_UNROLL  4
+#define VEC_UNROLL  2
+/* The macros below paste idx_row/idx_vec into local variable names and expect p_bias, p_vec1, p_mat1, p_out, the loop iterators and the quantization parameters to be in scope where they expand. */
+#define SETUP_BIAS_BATCH_ASYM8b(idx_row, idx_vec) \
+  ae_int64 _ae_int64_sat_bias_ ##idx_row ##_ ##idx_vec = AE_SRAI64(AE_MOVINT64_FROMINT32X2(AE_MOVDA32(p_bias[vec_itr + idx_vec])), 32); \
+
+#define SETUP_BIAS_BATCH_ROW_ASYM8b(idx_row) \
+  SETUP_BIAS_BATCH_VEC_UNROLL(idx_row) \
+
+#define SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b SETUP_ACC_BATCH_VEC_UNROLL
+
+#define SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b(idx_row,idx_vec) \
+  ae_int64 _ae_int64_acc_ ##idx_row ##_ ##idx_vec = ZERO64; \
+
+#define SETUP_VEC_BATCH_ASYM8b(idx_vec) \
+  ae_int16x4 _ae_int16x4_vec_batch_ ##idx_vec  = AE_ZERO16(); \
+  WORD8 *_WORD8_p_vec_batch_ ##idx_vec  = (WORD8 *)(&p_vec1[(vec_itr + idx_vec)*vec_stride]); \
+
+#define SETUP_MAT1_ASYM8b(idx) \
+  ae_int16x4 _ae_int16x4_mat1_ ## idx = AE_ZERO16(); \
+  WORD8 *_WORD8_p_mat1_ ## idx = (WORD8 *) p_mat1; \
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)_WORD8_p_mat1_ ##idx, (m_itr+idx)*row_stride1); \
+
+#if XCHAL_HAVE_HIFI1
+#define LOAD_VEC_BATCH_ASYM8b(idx_vec) \
+  AE_L8X4U_IP(_ae_int16x4_vec_batch_ ##idx_vec, _WORD8_p_vec_batch_ ##idx_vec, 4*sizeof(WORD8)); \
+  _ae_int16x4_vec_batch_ ##idx_vec = AE_ADD16(_ae_int16x4_vec_batch_ ##idx_vec, AE_MOVDA16(vec1_offset));
+#define LOAD_ROW_MAT1_ASYM8b(idx_row) \
+  _ae_int16x4_mat1_ ##idx_row = AE_L8X4U_I(_WORD8_p_mat1_ ##idx_row, 0); \
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)_WORD8_p_mat1_ ##idx_row, 4*sizeof(WORD8)); \
+  _ae_int16x4_mat1_ ##idx_row = AE_ADD16(_ae_int16x4_mat1_ ##idx_row, AE_MOVDA16(mat1_offset));
+#else
+#define LOAD_VEC_BATCH_ASYM8b(idx_vec) \
+  AE_L8X4F_IP(_ae_int16x4_vec_batch_ ##idx_vec, _WORD8_p_vec_batch_ ##idx_vec, 4*sizeof(WORD8)); \
+  _ae_int16x4_vec_batch_ ##idx_vec  = AE_MOVF16X4_FROMF64(AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec_batch_ ##idx_vec), 8)); \
+  _ae_int16x4_vec_batch_ ##idx_vec = AE_ADD16(_ae_int16x4_vec_batch_ ##idx_vec, AE_MOVDA16(vec1_offset));
+#define LOAD_ROW_MAT1_ASYM8b(idx_row) \
+  _ae_int16x4_mat1_ ##idx_row = AE_L8X4F_I(_WORD8_p_mat1_ ##idx_row, 0); \
+  AE_ADDCIRC16X4_XC((ae_int16x4 *)_WORD8_p_mat1_ ##idx_row, 4*sizeof(WORD8)); \
+  _ae_int16x4_mat1_ ##idx_row = AE_MOVF16X4_FROMF64(AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_mat1_ ##idx_row), 8)); \
+  _ae_int16x4_mat1_ ##idx_row = AE_ADD16(_ae_int16x4_mat1_ ##idx_row, AE_MOVDA16(mat1_offset));
+#endif
+
+#define KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b(idx_row) \
+  KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row); \
+
+#define KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b(idx_row, idx_vec) \
+  AE_MULAAAAQ16(_ae_int64_acc_ ## idx_row ##_ ##idx_vec, _ae_int16x4_vec_batch_ ##idx_vec, _ae_int16x4_mat1_ ## idx_row); \
+
+#define ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx_row) \
+  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row); \
+
+#define ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx_row,idx_vec) \
+  _ae_int64_acc_ ##idx_row ##_ ##idx_vec = AE_ADD64S(_ae_int64_acc_ ##idx_row ##_ ##idx_vec, _ae_int64_sat_bias_ ##idx_row ##_ ##idx_vec); \
+
+/* Output scaling according to Tensorflow logic; following are steps:
+    1. If left_shift is to be done, do it in 32-bit without saturation
+    2. Multiply by out_multiplier: 32x32 multiplication to 32 bit output
+    with asymmetric rounding and saturation
+    3. If right_shift is to be done, do it with symmetric rounding
+    4. Add out_offset */
+#define ADJUST_ACC_BATCH_ROW_ASYM8b(idx_row) \
+  ADJUST_ACC_BATCH_VEC_UNROLL(idx_row); \
+
+#if XCHAL_HAVE_HIFI1
+#define ADJUST_ACC_BATCH_ASYM8b(idx_row, idx_vec) \
+  ae_int32x2 _ae_int32x2_acc_ ##idx_row ##_ ##idx_vec = AE_SLAA32(AE_MOVINT32X2_FROMINT64(_ae_int64_acc_ ##idx_row ##_ ##idx_vec), left_shift); \
+  MPY_BY_QUANT_MULT_X2_OUT32(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVINT32X2_FROMINT64(_ae_int64_acc_ ##idx_row ##_ ##idx_vec), out_multiplier, left_shift, right_shift); \
+  (_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec) = AE_ADD32S(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVDA32(out_offset)); \
+
+
+#define STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row,idx_vec) \
+  _ae_int32x2_acc_ ##idx_row ##_ ##idx_vec = AE_MIN32(AE_MAX32(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVDA32(0)), AE_MOVDA32(255)); \
+    AE_S8_0_I_HIFI1(AE_MOVINT16X4_FROMINT32X2(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec), ((WORD8 *)(&p_out[(vec_itr + idx_vec)*out_col_offset + (m_itr + idx_row)*out_row_offset])) , 0); \
+
+#else
+#define ADJUST_ACC_BATCH_ASYM8b(idx_row, idx_vec) \
+  ae_int32x2 _ae_int32x2_acc_ ##idx_row ##_ ##idx_vec; \
+  MPY_BY_QUANT_MULT_X2_OUT32(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVINT32X2_FROMINT64(_ae_int64_acc_ ##idx_row ##_ ##idx_vec), out_multiplier, left_shift, right_shift); \
+  (_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec) = AE_ADD32S(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVDA32(out_offset)); \
+
+#define STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row,idx_vec) \
+  _ae_int32x2_acc_ ##idx_row ##_ ##idx_vec = AE_MIN32(AE_MAX32(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec, AE_MOVDA32(0)), AE_MOVDA32(255)); \
+  (*((UWORD8 *) (&p_out[(vec_itr + idx_vec)*out_col_offset + (m_itr + idx_row)*out_row_offset]))) = (UWORD8)AE_MOVAD32_L(_ae_int32x2_acc_ ##idx_row ##_ ##idx_vec); \
+
+#endif
+
+/* Saturate result to unsigned 8 bit (0-255) and store; the clamp itself is the AE_MAX32/AE_MIN32 pair in the per-element STORE macro above */
+#define STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row) \
+  STORE_ACC_BATCH_VEC_UNROLL(idx_row); \
+
+#if (ROW_UNROLL == 1)
+#define SETUP_ACC            UNROLL_SETUP_ACC(0)
+#define SETUP_ROW_SUM_MAT1   UNROLL_SETUP_ROW_SUM_MAT1(0)
+#define SETUP_MAT1           UNROLL_SETUP_MAT1(0)
+
+#elif (ROW_UNROLL == 2)
+#define SETUP_ACC            UNROLL_SETUP_ACC(0)            UNROLL_SETUP_ACC(1)
+#define SETUP_ROW_SUM_MAT1   UNROLL_SETUP_ROW_SUM_MAT1(0)   UNROLL_SETUP_ROW_SUM_MAT1(1)
+#define SETUP_MAT1           UNROLL_SETUP_MAT1(0)           UNROLL_SETUP_MAT1(1)
+
+#elif (ROW_UNROLL == 4)
+#define SETUP_MAT1           UNROLL_SETUP_MAT1(0)           UNROLL_SETUP_MAT1(1)           UNROLL_SETUP_MAT1(2)           UNROLL_SETUP_MAT1(3)
+
+#elif (ROW_UNROLL == 8)
+#define SETUP_ACC            UNROLL_SETUP_ACC(0)            UNROLL_SETUP_ACC(1)            UNROLL_SETUP_ACC(2)            UNROLL_SETUP_ACC(3)            UNROLL_SETUP_ACC(4)            UNROLL_SETUP_ACC(5)            UNROLL_SETUP_ACC(6)            UNROLL_SETUP_ACC(7)
+#define SETUP_ROW_SUM_MAT1   UNROLL_SETUP_ROW_SUM_MAT1(0)   UNROLL_SETUP_ROW_SUM_MAT1(1)   UNROLL_SETUP_ROW_SUM_MAT1(2)   UNROLL_SETUP_ROW_SUM_MAT1(3)   UNROLL_SETUP_ROW_SUM_MAT1(4)   UNROLL_SETUP_ROW_SUM_MAT1(5)   UNROLL_SETUP_ROW_SUM_MAT1(6)   UNROLL_SETUP_ROW_SUM_MAT1(7)
+#define SETUP_MAT1           UNROLL_SETUP_MAT1(0)           UNROLL_SETUP_MAT1(1)           UNROLL_SETUP_MAT1(2)           UNROLL_SETUP_MAT1(3)           UNROLL_SETUP_MAT1(4)           UNROLL_SETUP_MAT1(5)           UNROLL_SETUP_MAT1(6)           UNROLL_SETUP_MAT1(7)
+
+#endif /* (ROW_UNROLL == 1) */
+/* Only the ROW_UNROLL == 4, VEC_UNROLL == 2 combination below defines the batch macros (SETUP_*, LOAD_*, KERNEL_*, ADD_BIAS_*, ADJUST_*, STORE_*) that the kernel expands. */
+#if (ROW_UNROLL == 4 && VEC_UNROLL == 2)
+
+#define SETUP_VEC_BATCH                             UNROLL_SETUP_VEC_BATCH(0)               UNROLL_SETUP_VEC_BATCH(1)
+
+#define SETUP_BIAS_BATCH                            UNROLL_ROW_SETUP_BIAS_BATCH(0)          UNROLL_ROW_SETUP_BIAS_BATCH(1)          UNROLL_ROW_SETUP_BIAS_BATCH(2)      UNROLL_ROW_SETUP_BIAS_BATCH(3)
+#define SETUP_BIAS_BATCH_VEC_UNROLL(idx_row)        UNROLL_SETUP_BIAS_BATCH(idx_row,0)      UNROLL_SETUP_BIAS_BATCH(idx_row,1)
+#define SETUP_BIAS_BATCH_TAIL                       UNROLL_SETUP_BIAS_BATCH(0,0)            UNROLL_SETUP_BIAS_BATCH(1,0)            UNROLL_SETUP_BIAS_BATCH(2,0)        UNROLL_SETUP_BIAS_BATCH(3,0)
+
+#define SETUP_ACC_BATCH                             UNROLL_ROW_SETUP_ACC_BATCH(0)           UNROLL_ROW_SETUP_ACC_BATCH(1)           UNROLL_ROW_SETUP_ACC_BATCH(2)       UNROLL_ROW_SETUP_ACC_BATCH(3)
+#define SETUP_ACC_BATCH_VEC_UNROLL(idx_row)         UNROLL_SETUP_ACC_BATCH(idx_row,0)       UNROLL_SETUP_ACC_BATCH(idx_row,1)
+#define SETUP_ACC_BATCH_TAIL                        UNROLL_SETUP_ACC_BATCH(0,0)             UNROLL_SETUP_ACC_BATCH(1,0)             UNROLL_SETUP_ACC_BATCH(2,0)         UNROLL_SETUP_ACC_BATCH(3,0)
+
+#define LOAD_VEC_BATCH                              UNROLL_LOAD_VEC_BATCH(0)                UNROLL_LOAD_VEC_BATCH(1)
+#define LOAD_MAT1                                   UNROLL_LOAD_ROW_MAT1(0)                 UNROLL_LOAD_ROW_MAT1(1)                 UNROLL_LOAD_ROW_MAT1(2)             UNROLL_LOAD_ROW_MAT1(3)
+
+#define KERNEL_MAT1_VEC_BATCH                       UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0)     UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(1)     UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(2) UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(3)
+#define KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row)   UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row,0) UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row,1)
+#define KERNEL_MAT1_VEC_BATCH_TAIL                  UNROLL_KERNEL_MAT1_VEC_BATCH(0,0)       UNROLL_KERNEL_MAT1_VEC_BATCH(1,0)       UNROLL_KERNEL_MAT1_VEC_BATCH(2,0)   UNROLL_KERNEL_MAT1_VEC_BATCH(3,0)
+
+#define ADD_BIAS_ACC_BATCH                          UNROLL_ROW_ADD_BIAS_ACC(0)              UNROLL_ROW_ADD_BIAS_ACC(1)              UNROLL_ROW_ADD_BIAS_ACC(2)          UNROLL_ROW_ADD_BIAS_ACC(3)
+#define ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row)      UNROLL_ADD_BIAS_ACC_BATCH(idx_row,0)    UNROLL_ADD_BIAS_ACC_BATCH(idx_row,1)
+#define ADD_BIAS_ACC_BATCH_TAIL                     UNROLL_ADD_BIAS_ACC_BATCH(0,0)          UNROLL_ADD_BIAS_ACC_BATCH(1,0)          UNROLL_ADD_BIAS_ACC_BATCH(2,0)      UNROLL_ADD_BIAS_ACC_BATCH(3,0)
+
+#define STORE_ACC_BATCH                             UNROLL_ROW_STORE_ACC(0)                 UNROLL_ROW_STORE_ACC(1)                 UNROLL_ROW_STORE_ACC(2)             UNROLL_ROW_STORE_ACC(3)
+#define STORE_ACC_BATCH_VEC_UNROLL(idx_row)         UNROLL_STORE_ACC_BATCH(idx_row,0)       UNROLL_STORE_ACC_BATCH(idx_row,1)
+#define STORE_ACC_BATCH_TAIL                        UNROLL_STORE_ACC_BATCH(0,0)             UNROLL_STORE_ACC_BATCH(1,0)             UNROLL_STORE_ACC_BATCH(2,0)         UNROLL_STORE_ACC_BATCH(3,0)
+
+#define ADJUST_ACC_BATCH                            UNROLL_ROW_ADJUST_ACC(0)                UNROLL_ROW_ADJUST_ACC(1)                UNROLL_ROW_ADJUST_ACC(2)            UNROLL_ROW_ADJUST_ACC(3)
+#define ADJUST_ACC_BATCH_VEC_UNROLL(idx_row)        UNROLL_ADJUST_ACC_BATCH(idx_row,0)      UNROLL_ADJUST_ACC_BATCH(idx_row,1)
+#define ADJUST_ACC_BATCH_TAIL                       UNROLL_ADJUST_ACC_BATCH(0, 0)           UNROLL_ADJUST_ACC_BATCH(1, 0)           UNROLL_ADJUST_ACC_BATCH(2, 0)       UNROLL_ADJUST_ACC_BATCH(3, 0)
+
+#endif /* (ROW_UNROLL == 4 && VEC_UNROLL == 2)*/
+
+
+/* Quantized matrix x vector-batch product: for each vector v and matrix row r, accumulates
+ * (mat1[r][c] + mat1_offset) * (vec1[v][c] + vec1_offset) over the columns, adds p_bias[v], rescales with
+ * out_multiplier/out_shift, adds out_offset, clamps to [0, 255] and stores one byte at
+ * p_out[v*out_col_offset + r*out_row_offset]. Matrix pointers are stepped with AE_ADDCIRC16X4_XC.
+ * Returns 0 on success, -1 if out_shift is outside [-31, 31] or if p_bias, p_mat1 or p_vec1 is NULL. */
+WORD32 xa_nn_matXvec_asym8xasym8_asym8_circ(
+    UWORD8 * __restrict__ p_out,
+    UWORD8 * __restrict__ p_mat1,           /* matrix base; row (m_itr+idx) starts (m_itr+idx)*row_stride1 past it */
+    const UWORD8 * __restrict__ p_vec1,     /* vector v starts at p_vec1[v*vec_stride] */
+    const WORD32 * __restrict__ p_bias,     /* required; indexed by vector, not by row */
+    WORD32 rows,
+    WORD32 cols1,                           /* consumed four columns per pass: cols1 >> 2 passes */
+    WORD32 row_stride1,
+    WORD32 vec_count,
+    WORD32 vec_stride,
+    WORD32 out_col_offset,                  /* output step between consecutive vectors */
+    WORD32 out_row_offset,                  /* output step between consecutive rows */
+    WORD32 mat1_offset,                     /* added to every matrix element after load */
+    WORD32 vec1_offset,                     /* added to every vector element after load */
+    WORD32 out_multiplier,
+    WORD32 out_shift,                       /* must lie in [-31, 31] */
+    WORD32 out_offset)                      /* added after rescaling, before the clamp */
+{
+  /* Iterators used in for loops */
+  int m_itr, c_itr, vec_itr;
+  /* Shifts to match with Tensorflow */
+  int left_shift, right_shift;
+
+  if((out_shift > 31) || (out_shift < -31))
+  {
+    return -1;
+  }
+
+  if (!p_bias)
+  {
+    return -1;
+  }
+  /* Bind the generic UNROLL_* hooks used by the batch macros to the asym8 x asym8 implementations */
+#define UNROLL_ROW_SETUP_ACC_BATCH              SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b
+#define UNROLL_SETUP_ACC_BATCH                  SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b
+#define UNROLL_SETUP_MAT1                       SETUP_MAT1_ASYM8b
+#define UNROLL_SETUP_VEC_BATCH                  SETUP_VEC_BATCH_ASYM8b
+#define UNROLL_ROW_SETUP_BIAS_BATCH             SETUP_BIAS_BATCH_ROW_ASYM8b
+#define UNROLL_SETUP_BIAS_BATCH                 SETUP_BIAS_BATCH_ASYM8b
+#define UNROLL_LOAD_VEC_BATCH                   LOAD_VEC_BATCH_ASYM8b
+#define UNROLL_LOAD_ROW_MAT1                    LOAD_ROW_MAT1_ASYM8b
+#define UNROLL_ROW_KERNEL_MAT1_VEC_BATCH        KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b
+#define UNROLL_KERNEL_MAT1_VEC_BATCH            KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b
+#define UNROLL_ROW_ADD_BIAS_ACC                 ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b
+#define UNROLL_ADD_BIAS_ACC_BATCH               ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b
+#define UNROLL_ROW_ADJUST_ACC                   ADJUST_ACC_BATCH_ROW_ASYM8b
+#define UNROLL_ADJUST_ACC_BATCH                 ADJUST_ACC_BATCH_ASYM8b
+#define UNROLL_ROW_STORE_ACC                    STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b
+#define UNROLL_STORE_ACC_BATCH                  STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b
+
+#if TFLITE_SINGLE_ROUNDING
+  left_shift = out_shift;
+  right_shift = out_shift;
+  /* Single rounding macro doesn't need two shifts so this is not used */
+  (void)right_shift;
+#else /* #if TFLITE_SINGLE_ROUNDING */
+  left_shift = out_shift<0?0:out_shift;      /* positive out_shift becomes a left shift */
+  right_shift = out_shift>0?0:-out_shift;    /* negative out_shift becomes a right shift */
+#endif /* #if TFLITE_SINGLE_ROUNDING */
+
+  if(p_mat1 && p_vec1)
+  {
+    for(vec_itr = 0; vec_itr < (vec_count & ~(VEC_UNROLL-1)); vec_itr+= VEC_UNROLL)   /* full groups of VEC_UNROLL vectors */
+    {
+      for(m_itr = 0; m_itr < (rows & ~(ROW_UNROLL-1)); m_itr += ROW_UNROLL)   /* full groups of ROW_UNROLL rows */
+      {
+        SETUP_BIAS_BATCH;
+        SETUP_ACC_BATCH;
+        SETUP_VEC_BATCH;
+        SETUP_MAT1;
+        for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++)   /* four columns per pass; a cols1 % 4 remainder is not visited */
+        {
+          LOAD_VEC_BATCH;
+          LOAD_MAT1;
+          KERNEL_MAT1_VEC_BATCH;
+        }
+        ADD_BIAS_ACC_BATCH;
+        ADJUST_ACC_BATCH;
+        STORE_ACC_BATCH;
+      }
+      for(; m_itr < rows; m_itr++)   /* leftover rows one at a time, still VEC_UNROLL vectors each */
+      {
+        UNROLL_ROW_SETUP_BIAS_BATCH(0);
+        UNROLL_ROW_SETUP_ACC_BATCH(0);
+        SETUP_VEC_BATCH;
+        UNROLL_SETUP_MAT1(0);
+        for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++)
+        {
+          LOAD_VEC_BATCH;
+          UNROLL_LOAD_ROW_MAT1(0);
+          UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0);
+        }
+        UNROLL_ROW_ADD_BIAS_ACC(0);
+        UNROLL_ROW_ADJUST_ACC(0);
+        UNROLL_ROW_STORE_ACC(0);
+      }
+    }
+    /* Tail loop for vec unroll */
+    for(; vec_itr < vec_count; vec_itr++)
+    {
+      for(m_itr = 0; m_itr < (rows & ~(ROW_UNROLL-1)); m_itr += ROW_UNROLL)   /* full groups of ROW_UNROLL rows, single vector */
+      {
+        SETUP_BIAS_BATCH_TAIL;
+        SETUP_ACC_BATCH_TAIL;
+        UNROLL_SETUP_VEC_BATCH(0);
+        SETUP_MAT1;
+        for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++)
+        {
+          UNROLL_LOAD_VEC_BATCH(0);
+          LOAD_MAT1;
+          KERNEL_MAT1_VEC_BATCH_TAIL;
+        }
+        ADD_BIAS_ACC_BATCH_TAIL;
+        ADJUST_ACC_BATCH_TAIL;
+        STORE_ACC_BATCH_TAIL;
+      }
+      for(; m_itr < rows; m_itr++)   /* leftover rows, single vector */
+      {
+        UNROLL_SETUP_BIAS_BATCH(0,0);
+        UNROLL_SETUP_ACC_BATCH(0,0);
+        UNROLL_SETUP_VEC_BATCH(0);
+        UNROLL_SETUP_MAT1(0);
+        for(c_itr = 0; c_itr < (cols1 >> 2); c_itr++)
+        {
+            UNROLL_LOAD_VEC_BATCH(0);
+            UNROLL_LOAD_ROW_MAT1(0);
+            UNROLL_KERNEL_MAT1_VEC_BATCH(0,0);
+        }
+        UNROLL_ADD_BIAS_ACC_BATCH(0,0);
+        UNROLL_ADJUST_ACC_BATCH(0,0);
+        UNROLL_STORE_ACC_BATCH(0,0);
+      }
+    }
+  }
+  else
+  {
+    return -1;   /* p_mat1 or p_vec1 is NULL */
+  }
+  return 0;
+}
\ No newline at end of file