From d47aff4ec65b7b2386b8ecec0187e3e00ff847ce Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi Date: Fri, 30 Sep 2022 09:23:28 +0100 Subject: [PATCH 1/3] [CMSIS-NN] Support for int16 conv2d -Pattern matching and RelayToTIR introduce int16 support -Added new context buffer size APIs for int16 Conv2d -Added int16 variants to integration and buffer size tests Change-Id: I6083ea20b9125a9700a69a93c52c07eb463618b2 --- python/tvm/relay/op/contrib/cmsisnn.py | 29 +++- .../backend/contrib/cmsisnn/buffer_size.cc | 80 ++++++++++- .../backend/contrib/cmsisnn/buffer_size.h | 36 ++++- .../backend/contrib/cmsisnn/relay_to_tir.cc | 52 ++++--- .../backend/contrib/cmsisnn/tir_to_runtime.cc | 4 +- .../contrib/cmsisnn/buffer_size_test.cc | 86 +++++++++--- .../contrib/test_cmsisnn/test_conv2d.py | 130 +++++++++++------- tests/python/contrib/test_cmsisnn/utils.py | 65 +++++---- 8 files changed, 352 insertions(+), 130 deletions(-) diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index b887fafd7e00..1ec70195c9e0 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -121,11 +121,9 @@ def check_qnn_conv2d(pattern): requantize = pattern requantize_input = requantize.args[0] bias_add = None - bias_dtype = "int32" if str(requantize_input.op.name) == "nn.bias_add": bias_add = requantize_input conv2d = bias_add.args[0] - bias_dtype = bias_add.args[1].checked_type.dtype else: conv2d = requantize_input conv2d_input = conv2d.args[0] @@ -145,12 +143,29 @@ def check_qnn_conv2d(pattern): ): is_depthwise = True + # check if dtypes are supported for the following entities + # (input_dtype, weight_dtype, bias_dtype, out_dtype, pattern_dtype) + are_dtypes_valid = False + if bias_add: + bias_dtype = bias_add.args[1].checked_type.dtype + else: + bias_dtype = "int32" if conv2d_input.checked_type.dtype == "int8" else "int64" + valid_dtypes = None + if conv2d_input.checked_type.dtype == "int8": + valid_dtypes = ("int8", "int8", "int32", "int32", "int8") + elif conv2d_input.checked_type.dtype == "int16": + valid_dtypes = ("int16", "int8", "int64", "int64", "int16") + if ( + conv2d_input.checked_type.dtype, + conv2d_weight.checked_type.dtype, + bias_dtype, + conv2d.attrs.out_dtype, + pattern.checked_type.dtype, + ) == valid_dtypes: + are_dtypes_valid = True + ret = ( - conv2d.attrs.out_dtype == "int32" - and conv2d_input.checked_type.dtype == "int8" - and conv2d_weight.checked_type.dtype == "int8" - and pattern.checked_type.dtype == "int8" - and bias_dtype == "int32" + are_dtypes_valid and all([zp == 0 for zp in kernel_zp]) and (not is_depthwise or bias_add is not None) ) diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc index 25f4d054e810..d5ac80cdfc26 100644 --- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc +++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc @@ -29,10 +29,27 @@ namespace relay { namespace contrib { namespace cmsisnn { -int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n, - int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w, - int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h, - int32_t filter_w, int32_t filter_h) { +int Conv2dBufferSize(bool is_int16, Target target, int32_t padding_w, int32_t padding_h, + int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h, + int32_t output_w, int32_t stride_w, int32_t stride_h, int32_t dilation_w, + int32_t dilation_h, int32_t 
filter_w, int32_t filter_h) {
+  int size = -1;
+  if (is_int16) {
+    size = Conv2dBufferSizeInt16(target, padding_w, padding_h, input_n, input_h, input_c, output_h,
+                                 output_w, stride_w, stride_h, dilation_w, dilation_h, filter_w,
+                                 filter_h);
+  } else {
+    size = Conv2dBufferSizeInt8(target, padding_w, padding_h, input_n, input_h, input_c, output_h,
+                                output_w, stride_w, stride_h, dilation_w, dilation_h, filter_w,
+                                filter_h);
+  }
+  return size;
+}
+
+int Conv2dBufferSizeInt8(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                         int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                         int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                         int32_t filter_w, int32_t filter_h) {
   bool is1x1 = (padding_w == 0) && (padding_h == 0) && (input_c % 4 == 0) && (stride_w == 1) &&
                (stride_h == 1) && (filter_w == 1) && (filter_h == 1) && (dilation_w == 1) &&
                (dilation_h == 1);
@@ -62,9 +79,38 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
   return 0;
 }
 
-int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
-                              int32_t dilation_h) {
+int Conv2dBufferSizeInt16(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                          int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                          int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                          int32_t dilation_h, int32_t filter_w, int32_t filter_h) {
+  bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
+  bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
+
+  if (has_dsp && !has_mve) {
+    if ((filter_w * filter_h * input_c < 512) && dilation_w == 1 && dilation_h == 1) {
+      return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+    }
+  }
+  return 0;
+}
+
+int DepthwiseConv2dBufferSize(bool is_int16, Target target, int32_t input_n, int32_t input_c,
+                              int32_t output_c, int32_t filter_w, int32_t filter_h,
+                              int32_t dilation_w, int32_t dilation_h, int32_t depth_multiplier) {
+  int size = -1;
+  if (is_int16) {
+    size = DepthwiseConv2dBufferSizeInt16(target, input_n, input_c, output_c, filter_w, filter_h,
+                                          dilation_w, dilation_h, depth_multiplier);
+  } else {
+    size = DepthwiseConv2dBufferSizeInt8(target, input_n, input_c, output_c, filter_w, filter_h,
+                                         dilation_w, dilation_h, depth_multiplier);
+  }
+  return size;
+}
+
+int DepthwiseConv2dBufferSizeInt8(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
+                                  int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                                  int32_t dilation_h, int32_t depth_multiplier) {
   bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
   bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
 
@@ -78,6 +124,26 @@ int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, i
   return 0;
 }
 
+int DepthwiseConv2dBufferSizeInt16(Target target, int32_t input_n, int32_t input_c,
+                                   int32_t output_c, int32_t filter_w, int32_t filter_h,
+                                   int32_t dilation_w, int32_t dilation_h,
+                                   int32_t depth_multiplier) {
+  bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
+  bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
+
+  if (depth_multiplier == 1 && dilation_w == 1 && dilation_h == 1 &&
+      filter_w * filter_h * input_c < 512) {
+    if (has_dsp) {
+      if (has_mve) {
+        return 4 * input_c * filter_w * filter_h * (int32_t)sizeof(int16_t) + 8;
+      } else {
+        return input_c * filter_w * filter_h * (int32_t)sizeof(int16_t);
+      }
+    }
+  }
+  return 0;
+}
+
 int AvgPoolBufferSize(Target target, int32_t input_c) {
   bool has_mve = target->GetFeature<Bool>("has_mve").value_or(Bool(false));
   bool has_dsp = target->GetFeature<Bool>("has_dsp").value_or(Bool(false));
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h
index 9dae17c0a220..5cf8c309cc5e 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.h
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h
@@ -41,6 +41,7 @@ namespace cmsisnn {
 * See:
 * https://github.com/ARM-software/CMSIS_5/blob/8c60448c0e1e50e426180b26db9bc31ddf774361/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L108-L127
 *
+ * \param is_int16 - Whether the convolution operates on int16 data
 * \param target - CMSIS-NN Target
 * \param padding_w - Width padding
 * \param padding_h - Height padding
@@ -56,16 +57,27 @@ namespace cmsisnn {
 *
 * \return Size of buffer to allocate for convolution
 */
-int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
-                     int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
-                     int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
-                     int32_t filter_w, int32_t filter_h);
+int Conv2dBufferSize(bool is_int16, Target target, int32_t padding_w, int32_t padding_h,
+                     int32_t input_n, int32_t input_h, int32_t input_c, int32_t output_h,
+                     int32_t output_w, int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                     int32_t dilation_h, int32_t filter_w, int32_t filter_h);
+
+int Conv2dBufferSizeInt8(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                         int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                         int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                         int32_t filter_w, int32_t filter_h);
+
+int Conv2dBufferSizeInt16(Target target, int32_t padding_w, int32_t padding_h, int32_t input_n,
+                          int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
+                          int32_t stride_w, int32_t stride_h, int32_t dilation_w,
+                          int32_t dilation_h, int32_t filter_w, int32_t filter_h);
 
 /*!
 * \brief Calculates the appropriate buffer size for CMSIS-NN Depthwise Convolutions
 * See:
 * https://github.com/ARM-software/CMSIS_5/blob/325443e52637b6c7eedbd160d238a6c462e89c9f/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c#L115-L129
 *
+ * \param is_int16 - Whether the convolution operates on int16 data
 * \param target - CMSIS-NN Target
 * \param input_n - Input batch size
 * \param input_c - Input channels
@@ -74,12 +86,22 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_
 * \param filter_h - Filter height
 * \param dilation_w - Dilation width
 * \param dilation_h - Dilation height
+ * \param depth_multiplier - Depth multiplier for depthwise convolution
 *
 * \return Size of buffer to allocate for depthwise convolution
 */
-int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
-                              int32_t filter_w, int32_t filter_h, int32_t dilation_w,
-                              int32_t dilation_h);
+int DepthwiseConv2dBufferSize(bool is_int16, Target target, int32_t input_n, int32_t input_c,
+                              int32_t output_c, int32_t filter_w, int32_t filter_h,
+                              int32_t dilation_w, int32_t dilation_h, int32_t depth_multiplier);
+
+int DepthwiseConv2dBufferSizeInt8(Target target, int32_t input_n, int32_t input_c, int32_t output_c,
+                                  int32_t filter_w, int32_t filter_h, int32_t dilation_w,
+                                  int32_t dilation_h, int32_t depth_multiplier);
+
+int DepthwiseConv2dBufferSizeInt16(Target target, int32_t input_n, int32_t input_c,
+                                   int32_t output_c, int32_t filter_w, int32_t filter_h,
+                                   int32_t dilation_w, int32_t dilation_h,
+                                   int32_t depth_multiplier);
 
 /*!
 * \brief Calculates the appropriate buffer size for CMSIS-NN Average Pooling
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index a5cdfd570fea..da51e6b762dd 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -93,17 +93,17 @@ class RelayToTIRVisitor : public MixedModeMutator {
                                const Map<tir::Var, tir::Buffer>& buffer_map,
                                tvm::Array<PrimExpr> call_extern_args,
                                PrimExpr context_buffer_var = PrimExpr(),
-                               int context_buffer_size = 0) {
+                               int context_buffer_size = 0, int num_bits = 8) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set(tvm::attr::kGlobalSymbol, global_var->name_hint);
     dict_attrs.Set(tvm::attr::kTarget, target_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
-        tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
+        tvm::tir::Call(DataType::Int(num_bits), tir::builtin::call_extern(), call_extern_args));
 
     if (context_buffer_size) {
-      body = tir::Allocate(Downcast<tir::Var>(context_buffer_var), DataType::Int(8),
+      body = tir::Allocate(Downcast<tir::Var>(context_buffer_var), DataType::Int(num_bits),
                            {context_buffer_size}, tir::const_true(), body);
     }
 
@@ -133,6 +133,22 @@ class RelayToTIRVisitor : public MixedModeMutator {
     } else {
       conv2d_call = requantize_input;
     }
+    int32_t dtype_bits = conv2d_call->args[0]->type_as<TensorTypeNode>()->dtype.bits();
+
+    // Determine bitwidth of buffers based on input dtype
+    int32_t input_bits = 8;
+    int32_t filter_bits = 8;
+    int32_t bias_bits = 32;
+    int32_t output_bits = 8;
+    int32_t context_buffer_bits = 8;
+    bool is_int16 = false;
+    if (dtype_bits == 16) {
+      is_int16 = true;
+      input_bits = 16;
+      bias_bits = 64;
+      output_bits = 16;
+      context_buffer_bits = 16;
+    }
 
     // TIR variables are created in the order they appear in the Relay partitioned function
     // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
@@ -145,14 +161,14 @@
     const int filter_scale_pos = 3;
     const int input_scale_pos = bias_add_call ? 5 : 4;
     BufferCreator buffer_creator;
-    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
-    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(8));
+    tir::Var input = buffer_creator.CreateBufferVar("input", DataType::Handle(input_bits));
+    tir::Var filter = buffer_creator.CreateBufferVar("filter", DataType::Handle(filter_bits));
     tir::Var multiplier = buffer_creator.CreateBufferVar("multiplier", DataType::Handle(32));
     if (bias_add_call) {
-      buffer_creator.CreateBufferVar("bias", DataType::Handle(32));
+      buffer_creator.CreateBufferVar("bias", DataType::Handle(bias_bits));
     }
     tir::Var shift = buffer_creator.CreateBufferVar("shift", DataType::Handle(32));
-    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
+    tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(output_bits));
 
     // Relay function contains input_scale and filter_scale as function parameters at the following
     // locations in the global partitioned function for Conv2D
@@ -217,10 +233,10 @@ class RelayToTIRVisitor : public MixedModeMutator {
       scalar_args.push_back(ToArg(depth_multiplier));
 
     // original filter_layout for depthwise is HWOI
-    std::string cmsisnn_api = "arm_convolve_wrapper_s8";
+    std::string cmsisnn_api = is_int16 ? "arm_convolve_wrapper_s16" : "arm_convolve_wrapper_s8";
     bool is_depthwise = depth_multiplier != -1;
     if (is_depthwise) {
-      cmsisnn_api = "arm_depthwise_conv_wrapper_s8";
+      cmsisnn_api = is_int16 ? "arm_depthwise_conv_wrapper_s16" : "arm_depthwise_conv_wrapper_s8";
       int filter_pos_h = kernel_layout.find("H");
       int filter_pos_w = kernel_layout.find("W");
       Array<PrimExpr> depthwise_filter_shape{1, filter_shape[filter_pos_h],
@@ -242,18 +258,20 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Target target = CreateTarget(transform::PassContext::Current());
     size_t context_buffer_size;
     if (is_depthwise) {
-      context_buffer_size = DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w,
-                                                      filter_h, dilation_w, dilation_h);
+      context_buffer_size =
+          DepthwiseConv2dBufferSize(is_int16, target, input_n, input_c, output_c, filter_w,
+                                    filter_h, dilation_w, dilation_h, depth_multiplier);
     } else {
-      context_buffer_size = Conv2dBufferSize(target, padding_w, padding_h, input_n, input_h,
-                                             input_c, output_h, output_w, stride_w, stride_h,
-                                             dilation_w, dilation_h, filter_w, filter_h);
+      context_buffer_size = Conv2dBufferSize(is_int16, target, padding_w, padding_h, input_n,
+                                             input_h, input_c, output_h, output_w, stride_w,
+                                             stride_h, dilation_w, dilation_h, filter_w, filter_h);
     }
 
     if (context_buffer_size) {
       String context_buffer_name = "context_buffer_" + std::to_string(context_buffer_id_++);
-      context_buffer_var = tir::Var(context_buffer_name,
-                                    PointerType(PrimType(DataType::Int(8)), "global.workspace"));
+      context_buffer_var =
+          tir::Var(context_buffer_name,
+                   PointerType(PrimType(DataType::Int(context_buffer_bits)), "global.workspace"));
     }
     tvm::Array<PrimExpr> context_buffer_args = {context_buffer_var, ToArg(context_buffer_size)};
@@ -266,7 +284,7 @@ class RelayToTIRVisitor : public MixedModeMutator {
     CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
                             buffer_creator.GetBufferMap(), call_ext_args, context_buffer_var,
-                            context_buffer_size);
+                            context_buffer_size, context_buffer_bits);
   }
 
   void EmitFullyConnected(const GlobalVar& global_var, const Expr& expr) {
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc index 50fa3821b7fa..ae9f195ca509 100644 --- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc +++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc @@ -111,7 +111,9 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { cmsis_func_name == "arm_elementwise_add_s8") { CodeGenC::VisitExpr_(op, os); } else if (cmsis_func_name == "arm_convolve_wrapper_s8" || - cmsis_func_name == "arm_depthwise_conv_wrapper_s8") { + cmsis_func_name == "arm_convolve_wrapper_s16" || + cmsis_func_name == "arm_depthwise_conv_wrapper_s8" || + cmsis_func_name == "arm_depthwise_conv_wrapper_s16") { EmitConv2D(op); } else if (cmsis_func_name == "arm_fully_connected_s8") { EmitFullyConnected(op); diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc index d8870fa71525..2094b70eb872 100644 --- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc +++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc @@ -46,10 +46,10 @@ static const Target kNoExt("cmsis-nn -mcpu=cortex-m55 -mattr=+nodsp,+nomve"); class CMSISNNCalculatedBufferSize : public testing::TestWithParam> {}; -TEST(CMSISNNConv2dBufferSize, Conv1x1) { +TEST(CMSISNNConv2dBufferSizeInt8, Conv1x1) { int32_t any = fake_parameters(gen); auto conv2d_1x1 = [=](Target target, int32_t input_c) { - return Conv2dBufferSize(target, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1, 1, 1); + return Conv2dBufferSizeInt8(target, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1, 1, 1); }; ASSERT_EQ(conv2d_1x1(kNoExt, 4), 0); @@ -71,7 +71,7 @@ TEST(CMSISNNConv2dBufferSize, Conv1x1) { ASSERT_EQ(conv2d_1x1(kHasMVE, 32), 0); } -TEST(CMSISNNConv2dBufferSize, Conv1xN) { +TEST(CMSISNNConv2dBufferSizeInt8, Conv1xN) { int32_t any = fake_parameters(gen); int32_t input_c = fake_parameters(gen); int32_t filter_w = fake_parameters(gen); @@ -79,8 +79,8 @@ TEST(CMSISNNConv2dBufferSize, Conv1xN) { int32_t calculated_buffer = (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); auto conv2d_1xn = [=](Target target, int32_t output_w) { - return Conv2dBufferSize(target, any, any, 1, 1, input_c, 1, output_w, any, any, 1, 1, filter_w, - filter_h); + return Conv2dBufferSizeInt8(target, any, any, 1, 1, input_c, 1, output_w, any, any, 1, 1, + filter_w, filter_h); }; ASSERT_EQ(conv2d_1xn(kNoExt, 4), calculated_buffer); @@ -102,7 +102,7 @@ TEST(CMSISNNConv2dBufferSize, Conv1xN) { ASSERT_EQ(conv2d_1xn(kHasMVE, 32), 0); } -TEST(CMSISNNConv2dBufferSize, Default) { +TEST(CMSISNNConv2dBufferSizeInt8, Default) { int32_t any = fake_parameters(gen); int32_t input_c = fake_parameters(gen); @@ -114,8 +114,8 @@ TEST(CMSISNNConv2dBufferSize, Default) { int32_t calculated_buffer_mve = 4 * col_length * 8 * (int32_t)sizeof(int8_t); auto conv2d = [=](Target target, int32_t output_w) { - return Conv2dBufferSize(target, any, any, 1, 1, input_c, 1, output_w, any, any, any, any, - filter_w, filter_h); + return Conv2dBufferSizeInt8(target, any, any, 1, 1, input_c, 1, output_w, any, any, any, any, + filter_w, filter_h); }; ASSERT_EQ(conv2d(kNoExt, 4), calculated_buffer); @@ -137,13 +137,39 @@ TEST(CMSISNNConv2dBufferSize, Default) { ASSERT_EQ(conv2d(kHasMVE, 32), calculated_buffer_mve); } -TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) { +TEST(CMSISNNConv2dBufferSizeInt16, Default) { + int32_t any = fake_parameters(gen); + + auto conv2d_int16_buffer = [=](Target target, int32_t input_c, int32_t filter_w, + int32_t filter_h) { + return 
Conv2dBufferSizeInt16(target, any, any, 1, 1, input_c, any, any, any, any, 1, 1, + filter_w, filter_h); + }; + + auto calculated_buffer = [=](int32_t input_c, int32_t filter_w, int32_t filter_h) { + return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); + }; + + ASSERT_EQ(conv2d_int16_buffer(kNoExt, 3, 5, 5), 0); + ASSERT_EQ(conv2d_int16_buffer(kNoExt, 32, 3, 3), 0); + + ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 3, 3, 3), calculated_buffer(3, 3, 3)); + ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 12, 5, 5), calculated_buffer(12, 5, 5)); + ASSERT_EQ(conv2d_int16_buffer(kHasDSP, 24, 5, 5), 0); + + ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 3, 3, 3), 0); + ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 12, 5, 5), 0); + ASSERT_EQ(conv2d_int16_buffer(kHasMVE, 24, 5, 5), 0); +} + +TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, UnEvenChannels) { int32_t filter_w = fake_parameters(gen); int32_t filter_h = fake_parameters(gen); int32_t input_n = 1; auto depthwise_conv2d_with_channels = [=](Target target, int32_t input_c, int32_t output_c) { - return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h, 1, 1); + return DepthwiseConv2dBufferSizeInt8(target, input_n, input_c, output_c, filter_w, filter_h, 1, + 1, 1); }; ASSERT_EQ(depthwise_conv2d_with_channels(kNoExt, 4, 6), 0); @@ -154,14 +180,14 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) { ASSERT_EQ(depthwise_conv2d_with_channels(kHasMVE, 8, 7), 0); } -TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) { +TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, MultipleBatches) { int32_t input_output_c = fake_parameters(gen); int32_t filter_w = fake_parameters(gen); int32_t filter_h = fake_parameters(gen); auto depthwise_conv2d_with_batch = [=](Target target, int32_t input_n) { - return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w, - filter_h, 1, 1); + return DepthwiseConv2dBufferSizeInt8(target, input_n, input_output_c, input_output_c, filter_w, + filter_h, 1, 1, 1); }; ASSERT_EQ(depthwise_conv2d_with_batch(kNoExt, 4), 0); @@ -172,7 +198,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) { ASSERT_EQ(depthwise_conv2d_with_batch(kHasMVE, 7), 0); } -TEST(CMSISNNDepthwiseConv2dBufferSize, Default) { +TEST(CMSISNNDepthwiseConv2dBufferSizeInt8, Default) { int32_t input_output_c = fake_parameters(gen); int32_t filter_w = fake_parameters(gen); int32_t filter_h = fake_parameters(gen); @@ -183,8 +209,8 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) { int32_t dsp_calculated_buffer = (input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); auto depthwise_conv2d = [=](Target target) { - return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w, - filter_h, 1, 1); + return DepthwiseConv2dBufferSizeInt8(target, input_n, input_output_c, input_output_c, filter_w, + filter_h, 1, 1, 1); }; ASSERT_EQ(depthwise_conv2d(kNoExt), 0); @@ -195,6 +221,34 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) { ASSERT_EQ(depthwise_conv2d(kHasMVE), mve_calculated_buffer); } +TEST(CMSISNNDepthwiseConv2dBufferSizeInt16, Default) { + int32_t any = fake_parameters(gen); + + auto depthwise_int16_buffer = [=](Target target, int32_t input_c, int32_t filter_w, + int32_t filter_h) { + return DepthwiseConv2dBufferSizeInt16(target, any, input_c, any, filter_w, filter_h, 1, 1, 1); + }; + + auto dsp_only_buffer = [=](int32_t input_c, int32_t filter_w, int32_t filter_h) { + return (input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); + }; + + auto dsp_mve_buffer = 
[=](int32_t input_c, int32_t filter_w, int32_t filter_h) { + return (4 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 8; + }; + + ASSERT_EQ(depthwise_int16_buffer(kNoExt, 3, 5, 5), 0); + ASSERT_EQ(depthwise_int16_buffer(kNoExt, 32, 3, 3), 0); + + ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 3, 3, 3), dsp_only_buffer(3, 3, 3)); + ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 12, 5, 5), dsp_only_buffer(12, 5, 5)); + ASSERT_EQ(depthwise_int16_buffer(kHasDSP, 24, 5, 5), 0); + + ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 3, 3, 3), dsp_mve_buffer(3, 3, 3)); + ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 12, 5, 5), dsp_mve_buffer(12, 5, 5)); + ASSERT_EQ(depthwise_int16_buffer(kHasMVE, 24, 5, 5), 0); +} + TEST(CMSISNNAvgPoolBufferSize, Default) { int32_t input_c = fake_parameters(gen); int32_t calculated_buffer = (input_c * sizeof(int32_t)); diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index d33d71261613..9ff55c952c88 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -36,6 +36,7 @@ get_range_for_dtype_str, get_same_padding, get_conv2d_qnn_params, + get_kernel_bias_dtype, make_qnn_relu, assert_partitioned_function, assert_no_external_function, @@ -59,8 +60,9 @@ def make_model( groups, dtype, kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, input_op=None, @@ -71,8 +73,8 @@ def make_model( else: op = relay.var("input", shape=shape, dtype=dtype) - h_index = weight_format.index("H") - w_index = weight_format.index("W") + h_index = kernel_layout.index("H") + w_index = kernel_layout.index("W") kernel_h = kernel_shape[h_index] kernel_w = kernel_shape[w_index] p = (0, 0, 0, 0) @@ -80,7 +82,7 @@ def make_model( p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) rng = np.random.default_rng(12321) - weight = tvm.nd.array( + kernel = tvm.nd.array( rng.integers( np.iinfo(kernel_dtype).min, high=np.iinfo(kernel_dtype).max, @@ -88,27 +90,27 @@ def make_model( dtype=kernel_dtype, ) ) - weight_const = relay.const(weight, kernel_dtype) + kernel_const = relay.const(kernel, kernel_dtype) conv2d_kernel_sc = kernel_scale[0] if out_channels == 1 else kernel_scale conv = relay.qnn.op.conv2d( op, - weight_const, + kernel_const, input_zero_point=relay.const(input_zero_point, "int32"), kernel_zero_point=relay.const(kernel_zero_point, "int32"), input_scale=relay.const(input_scale, "float32"), kernel_scale=relay.const(conv2d_kernel_sc, "float32"), kernel_size=(kernel_h, kernel_w), data_layout="NHWC", - kernel_layout=weight_format, + kernel_layout=kernel_layout, dilation=dilation, strides=strides, groups=groups, channels=out_channels, padding=p, - out_dtype="int32", + out_dtype=bias_dtype, ) - bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype="int32")) - bias_const = relay.const(bias, "int32") + bias = tvm.nd.array(rng.integers(0, high=10, size=(out_channels,), dtype=bias_dtype)) + bias_const = relay.const(bias, bias_dtype) last_op = relay.nn.bias_add(conv, bias_const, axis=3) if enable_bias else conv requant_input_sc = [sc * input_scale for sc in kernel_scale] requant_input_sc = requant_input_sc[0] if out_channels == 1 else requant_input_sc @@ -121,7 +123,7 @@ def make_model( out_dtype=dtype, ) last_op = make_qnn_relu(last_op, relu_type, output_scale, output_zero_point, dtype) - params = {"w": weight, "b": bias} + params = {"w": kernel, "b": bias} return last_op, params @@ -150,7 
+152,7 @@ def test_conv2d_number_primfunc_args( dilation = (1, 1) dtype = "int8" groups = 1 - weight_format = "HWIO" + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) @@ -158,6 +160,8 @@ def test_conv2d_number_primfunc_args( in_min, in_max = get_range_for_dtype_str(dtype) relu_type = "RELU" + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -165,7 +169,7 @@ def test_conv2d_number_primfunc_args( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, ) @@ -183,9 +187,10 @@ def test_conv2d_number_primfunc_args( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, ) @@ -220,6 +225,7 @@ def test_conv2d_number_primfunc_args( @tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("dtype", ["int8", "int16"]) @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @pytest.mark.parametrize("relu_type", ["RELU"]) @pytest.mark.parametrize("enable_bias", [True, False]) @@ -230,7 +236,8 @@ def test_conv2d_number_primfunc_args( @pytest.mark.parametrize( "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")] ) -def test_conv2d_symmetric_padding_int8( +def test_conv2d_symmetric_padding( + dtype, padding, enable_bias, relu_type, @@ -249,15 +256,17 @@ def test_conv2d_symmetric_padding_int8( kernel_size = (3, 3) strides = (1, 1) dilation = (1, 1) - dtype = "int8" groups = 1 - weight_format = "HWIO" + input_zero_point = input_zero_point if dtype == "int8" else 0 + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 in_min, in_max = get_range_for_dtype_str(dtype) + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -265,7 +274,7 @@ def test_conv2d_symmetric_padding_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, ) @@ -283,9 +292,10 @@ def test_conv2d_symmetric_padding_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, ) @@ -321,7 +331,7 @@ def test_conv2d_symmetric_padding_int8( "input_zero_point, input_scale, kernel_scale, out_channels", [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)], ) -def test_conv2d_asymmetric_padding_int8( +def test_conv2d_asymmetric_padding( padding, enable_bias, relu_type, @@ -335,19 +345,22 @@ def test_conv2d_asymmetric_padding_int8( use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER + dtype = "int8" ifm_shape = (1, 25, 25, 12) kernel_size = (5, 5) strides = (2, 2) dilation = (1, 1) - dtype = "int8" groups = 1 - weight_format = "HWIO" + input_zero_point = input_zero_point if dtype == "int8" else 0 + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 in_min, in_max = get_range_for_dtype_str(dtype) + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -355,7 +368,7 @@ def 
test_conv2d_asymmetric_padding_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, ) @@ -373,9 +386,10 @@ def test_conv2d_asymmetric_padding_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, ) @@ -434,13 +448,14 @@ def test_pad_conv2d_fusion_int8( kernel_scale = [0.11, 0.22] out_channels = 2 groups = 1 - weight_format = "HWIO" + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 in_min, in_max = get_range_for_dtype_str(dtype) + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -448,7 +463,7 @@ def test_pad_conv2d_fusion_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, ) @@ -474,9 +489,10 @@ def test_pad_conv2d_fusion_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, input_op=pad, @@ -545,13 +561,15 @@ def test_invalid_pad_conv2d_fusion_int8( kernel_scale = [0.11, 0.22] out_channels = 2 groups = 1 - weight_format = "HWIO" + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 in_min, in_max = get_range_for_dtype_str(dtype) + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -559,7 +577,7 @@ def test_invalid_pad_conv2d_fusion_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, ) @@ -585,9 +603,10 @@ def test_invalid_pad_conv2d_fusion_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, input_op=pad, @@ -675,6 +694,7 @@ def test_conv2d_int8_tflite(ifm_shape, kernel_shape, strides, dilation, padding, @tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("dtype", ["int8", "int16"]) @pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)]) @pytest.mark.parametrize("kernel_size", [(3, 3)]) @pytest.mark.parametrize("padding", ["SAME", "VALID"]) @@ -691,7 +711,8 @@ def test_conv2d_int8_tflite(ifm_shape, kernel_shape, strides, dilation, padding, @pytest.mark.parametrize( "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")] ) -def test_depthwise_int8( +def test_depthwise( + dtype, ifm_shape, kernel_size, padding, @@ -711,9 +732,9 @@ def test_depthwise_int8( interface_api = "c" use_unpacked_api = True - dtype = "int8" groups = 1 - weight_format = "HWIO" + input_zero_point = input_zero_point if dtype == "int8" else 0 + kernel_layout = "HWIO" kernel_h = kernel_size[0] kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) @@ -721,12 +742,14 @@ def test_depthwise_int8( in_min, in_max = get_range_for_dtype_str(dtype) groups = ifm_shape[3] - weight_format = "HWOI" + kernel_layout = "HWOI" kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier) out_channels = ifm_shape[3] * depth_multiplier ks_len = len(kernel_scale) kernel_scale = [kernel_scale[i % ks_len] for i in 
range(out_channels)] + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -734,7 +757,7 @@ def test_depthwise_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, is_depthwise=True, ) @@ -753,9 +776,10 @@ def test_depthwise_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, ) @@ -823,7 +847,8 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( ifm_shape = (1, 24, 24, 1) groups = ifm_shape[3] - weight_format = "HWIO" + input_zero_point = input_zero_point if dtype == "int8" else 0 + kernel_layout = "HWIO" (kernel_h, kernel_w) = (3, 3) kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier) out_channels = ifm_shape[3] * depth_multiplier @@ -832,6 +857,8 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( kernel_zero_point = 0 kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)] + kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) + output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -839,7 +866,7 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( kernel_scale, kernel_zero_point, input_dtype=dtype, - weights_dtype=dtype, + kernel_dtype=kernel_dtype, output_dtype=dtype, is_depthwise=True, ) @@ -858,9 +885,10 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( dilation, groups, dtype, - dtype, + kernel_dtype, + bias_dtype, out_channels, - weight_format, + kernel_layout, enable_bias, relu_type, ) @@ -915,13 +943,15 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( def parameterize_for_invalid_model(test): """Generates non int8 inputs""" - in_dtype = ["uint8", "int8"] + in_dtype = ["uint8", "int8", "int16"] kernel_dtype = ["uint8", "int8"] kernel_zero_point = [-33, 10, 0] all_combinations = itertools.product(in_dtype, kernel_dtype, kernel_zero_point) all_combinations = filter( lambda parameters: not ( - parameters[0] == "int8" and parameters[1] == "int8" and parameters[2] == 0 + (parameters[0] == "int8" or parameters[0] == "int16") + and parameters[1] == "int8" + and parameters[2] == 0 ), all_combinations, ) @@ -947,6 +977,7 @@ def test_invalid_parameters( kernel_layout = "HWIO" kernel_shape = [3, 3, ifm_shape[3], out_channels] + _, bias_dtype = get_kernel_bias_dtype(in_dtype) output_scale, output_zero_point = get_conv2d_qnn_params( kernel_shape, input_scale, @@ -973,8 +1004,9 @@ def test_invalid_parameters( groups=1, dtype=in_dtype, kernel_dtype=kernel_dtype, + bias_dtype=bias_dtype, out_channels=out_channels, - weight_format=kernel_layout, + kernel_layout=kernel_layout, enable_bias=True, relu_type="NONE", ) diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py index 9fdb89289aff..f3a6b0c1343b 100644 --- a/tests/python/contrib/test_cmsisnn/utils.py +++ b/tests/python/contrib/test_cmsisnn/utils.py @@ -137,39 +137,52 @@ def get_same_padding(in_shape, kernel, dilation, stride): return [pad_top, pad_left, pad_bottom, pad_right] +def get_kernel_bias_dtype(input_dtype): + """ + Returns (kernel_dtype, bias_dtype) based on input's dtype. 
+ """ + # uint8 corresponds to an invalid case, so returning int types + # does not cause tests to break + if input_dtype in ("int8", "uint8"): + return ("int8", "int32") + elif input_dtype == "int16": + return ("int8", "int64") + raise ValueError("Invalid dtype provided to get_kernel_bias_dtype()") + + def get_conv2d_qnn_params( - weight_shape: List[int], + kernel_shape: List[int], input_scale: float, input_zp: int, - weights_scale: Union[float, List[float]], - weights_zp: int, + kernel_scale: Union[float, List[float]], + kernel_zp: int, input_dtype: str = "int8", - weights_dtype: str = "int8", + kernel_dtype: str = "int8", output_dtype: str = "int8", is_depthwise: bool = False, ) -> Tuple[float, int]: """ Calculate the output quantization parameters for convolution based on the input and - weights quantization paramters and the data types. + kernel quantization paramters and the data types. Parameters ---------- - weight_shape : List[int] - shape of the weights + kernel_shape : List[int] + shape of the kernel input_scale : float scale of the input tensor input_zp : int zero point of the input tensor - weights_scale : Union[float, List[float]] - scale(s) of the weights tensor - weights_zp : int - zero point of the weights tensor + kernel_scale : Union[float, List[float]] + scale(s) of the kernel tensor + kernel_zp : int + zero point of the kernel tensor is_depthwise : bool whether it is a depthwise convolution input_dtype : str data type of the input tensor - weights_dtype : str - data type of the weights tensor + kernel_dtype : str + data type of the kernel tensor output_dtype : str data type of the output tensor @@ -184,27 +197,27 @@ def get_conv2d_qnn_params( input_max = input_scale * (input_dtype_max - input_zp) input_min = input_scale * (input_dtype_min - input_zp) - weights_dtype_min, weights_dtype_max = get_range_for_dtype_str(weights_dtype) - weights_sc_max = np.max(weights_scale) - weights_max = weights_sc_max * (weights_dtype_max - weights_zp) + kernel_dtype_min, kernel_dtype_max = get_range_for_dtype_str(kernel_dtype) + kernel_sc_max = np.max(kernel_scale) + kernel_max = kernel_sc_max * (kernel_dtype_max - kernel_zp) - weights_sc_min = np.min(weights_scale) - weights_min = weights_sc_min * (weights_dtype_min - weights_zp) + kernel_sc_min = np.min(kernel_scale) + kernel_min = kernel_sc_min * (kernel_dtype_min - kernel_zp) - weights_h = weight_shape[1] - weights_w = weight_shape[2] - channels = weight_shape[3] - num_elements = weights_h * weights_w * channels + kernel_h = kernel_shape[1] + kernel_w = kernel_shape[2] + channels = kernel_shape[3] + num_elements = kernel_h * kernel_w * channels # Adjust the result if it is a depthwise convolution if is_depthwise: num_elements = num_elements / channels # The smallest and largest possible values in the unquantized output tensor output_limits = [ - weights_max * input_max * num_elements, - weights_min * input_max * num_elements, - weights_min * input_min * num_elements, - weights_max * input_min * num_elements, + kernel_max * input_max * num_elements, + kernel_min * input_max * num_elements, + kernel_min * input_min * num_elements, + kernel_max * input_min * num_elements, ] output_max = max(output_limits) From 8300e01a923ad2cf3e0bc5c06f14a07441f926f1 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi Date: Fri, 30 Sep 2022 19:56:12 +0100 Subject: [PATCH 2/3] Added few comments and simplified variable assignment Change-Id: I1956107ff4a52437a525aa34c746acc5e3a31631 --- python/tvm/relay/op/contrib/cmsisnn.py | 11 +++++++---- 
 tests/python/contrib/test_cmsisnn/test_conv2d.py |  3 ++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 1ec70195c9e0..0e82ab8be3db 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -146,17 +146,19 @@ def check_qnn_conv2d(pattern):
     # check if dtypes are supported for the following entities
     # (input_dtype, weight_dtype, bias_dtype, out_dtype, pattern_dtype)
     are_dtypes_valid = False
+    conv2d_input_dtype = conv2d_input.checked_type.dtype
     if bias_add:
         bias_dtype = bias_add.args[1].checked_type.dtype
     else:
-        bias_dtype = "int32" if conv2d_input.checked_type.dtype == "int8" else "int64"
+        # this is only to enable the following check that validates all sorts of dtypes
+        bias_dtype = "int32" if conv2d_input_dtype == "int8" else "int64"
     valid_dtypes = None
-    if conv2d_input.checked_type.dtype == "int8":
+    if conv2d_input_dtype == "int8":
         valid_dtypes = ("int8", "int8", "int32", "int32", "int8")
-    elif conv2d_input.checked_type.dtype == "int16":
+    elif conv2d_input_dtype == "int16":
         valid_dtypes = ("int16", "int8", "int64", "int64", "int16")
     if (
-        conv2d_input.checked_type.dtype,
+        conv2d_input_dtype,
         conv2d_weight.checked_type.dtype,
         bias_dtype,
         conv2d.attrs.out_dtype,
@@ -164,6 +166,7 @@ def check_qnn_conv2d(pattern):
         pattern.checked_type.dtype,
     ) == valid_dtypes:
         are_dtypes_valid = True
+    # combination of all checks to decide if pattern is eligible for partitioning
     ret = (
         are_dtypes_valid
         and all([zp == 0 for zp in kernel_zp])
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index 9ff55c952c88..63cbb4d2518b 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -257,6 +257,7 @@ def test_conv2d_symmetric_padding(
     strides = (1, 1)
     dilation = (1, 1)
     groups = 1
+    # input_zero_point is not handled by TFLM when int16
     input_zero_point = input_zero_point if dtype == "int8" else 0
     kernel_layout = "HWIO"
@@ -942,7 +943,7 @@ def test_relay_conv2d_cmsisnn_depthwise_int8(

 def parameterize_for_invalid_model(test):
-    """Generates non int8 inputs"""
+    """Generates non-int8 non-int16 inputs"""
     in_dtype = ["uint8", "int8", "int16"]
     kernel_dtype = ["uint8", "int8"]
     kernel_zero_point = [-33, 10, 0]

From b5d1995b0a30ca30e538dc0feb94ef7e026aa950 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi
Date: Mon, 3 Oct 2022 18:04:19 +0100
Subject: [PATCH 3/3] Added check that input_zero_point is zero

Change-Id: I62e67fdbe2781c90e55028ff2da88789623f269a
---
 python/tvm/relay/op/contrib/cmsisnn.py           | 15 +++++++++++----
 tests/python/contrib/test_cmsisnn/test_conv2d.py | 11 +++++++----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 0e82ab8be3db..8964937469c4 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -129,10 +129,6 @@ def check_qnn_conv2d(pattern):
     conv2d_input = conv2d.args[0]
     conv2d_weight = conv2d.args[1]

-    # kernel zero_point should be 0
-    kernel_zp = conv2d.args[3].data.numpy()
-    kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp
-
     # check if depthwise Conv2D
     kernel_layout = conv2d.attrs.kernel_layout
     pos_o = kernel_layout.index("O")
@@ -157,6 +153,7 @@ def check_qnn_conv2d(pattern):
         valid_dtypes = ("int8", "int8", "int32", "int32", "int8")
     elif conv2d_input_dtype == "int16":
valid_dtypes = ("int16", "int8", "int64", "int64", "int16") + if ( conv2d_input_dtype, conv2d_weight.checked_type.dtype, @@ -166,9 +163,19 @@ def check_qnn_conv2d(pattern): ) == valid_dtypes: are_dtypes_valid = True + # input_zero_point should be 0 when int16 + valid_input_zp = True + if conv2d_input_dtype == "int16" and conv2d.args[2].data.numpy().item(0) != 0: + valid_input_zp = False + + # kernel zero_point should be 0 + kernel_zp = conv2d.args[3].data.numpy() + kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp + # combination of all checks to decide if pattern is eligible for partitioning ret = ( are_dtypes_valid + and valid_input_zp and all([zp == 0 for zp in kernel_zp]) and (not is_depthwise or bias_add is not None) ) diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 63cbb4d2518b..66ff5d793880 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -947,17 +947,20 @@ def parameterize_for_invalid_model(test): in_dtype = ["uint8", "int8", "int16"] kernel_dtype = ["uint8", "int8"] kernel_zero_point = [-33, 10, 0] - all_combinations = itertools.product(in_dtype, kernel_dtype, kernel_zero_point) + input_zero_point = [64, 0] + all_combinations = itertools.product( + in_dtype, kernel_dtype, kernel_zero_point, input_zero_point + ) all_combinations = filter( lambda parameters: not ( - (parameters[0] == "int8" or parameters[0] == "int16") + (parameters[0] == "int8" or (parameters[0] == "int16" and parameters[3] == 0)) and parameters[1] == "int8" and parameters[2] == 0 ), all_combinations, ) return pytest.mark.parametrize( - ["in_dtype", "kernel_dtype", "kernel_zero_point"], + ["in_dtype", "kernel_dtype", "kernel_zero_point", "input_zero_point"], all_combinations, )(test) @@ -968,12 +971,12 @@ def test_invalid_parameters( in_dtype, kernel_dtype, kernel_zero_point, + input_zero_point, ): """Tests Depthwise op for non int8 inputs""" ifm_shape = (1, 28, 28, 12) out_channels = 2 input_scale = 1 - input_zero_point = 24 kernel_scale = [0.11, 0.0237] kernel_layout = "HWIO"