From f02e26ebbdc043fe929e0cedb030b41311361df3 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Mon, 27 Jul 2020 09:31:38 +0100
Subject: [PATCH 1/3] [BYOC][ACL] Depthwise convolution support

Added support for depthwise convolution. ACL only supports depth-wise convolution when kernel size is 3x3 and 5x5 and strides are (1, 1) or (2, 2), if this is not the case then fallback to TVM.

Also rework tests to remove non-deterministic trials.

*Compute Library for the Arm Architecture (ACL).
*All credits to Luke Hutton @lhutton1

Change-Id: Ida1f5802a65377b84325edf14a0149242c1af857
---
 docs/deploy/arm_compute_lib.rst               |   8 +-
 .../tvm/relay/op/contrib/arm_compute_lib.py   | 109 ++++++-
 python/tvm/relay/testing/__init__.py          |   6 +-
 .../contrib/arm_compute_lib/codegen.cc        |  48 ++-
 .../contrib/arm_compute_lib/acl_runtime.cc    |  69 ++++-
 .../contrib/arm_compute_lib/acl_utils.cc      |  10 +
 .../contrib/arm_compute_lib/acl_utils.h       |   9 +
 .../test_arm_compute_lib/infrastructure.py    |  42 ---
 .../test_arm_compute_lib/test_conv2d.py       | 283 +++++++++++-------
 .../test_arm_compute_lib/test_dense.py        |  83 +++--
 .../test_arm_compute_lib/test_network.py      |   4 +-
 11 files changed, 450 insertions(+), 221 deletions(-)

diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index a2eaa5fb5662..5d11241c1a34 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -15,7 +15,7 @@
     specific language governing permissions and limitations
     under the License.
 
-Relay Arm :sup:`®` Compute Library Integration
+Relay Arm:sup:`®` Compute Library Integration
 ==============================================
 **Author**: `Luke Hutton <https://github.com/lhutton1>`_
 
@@ -195,12 +195,14 @@ Operator support
 |                      |   Simple: nn.conv2d                                                     |
 |                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?                 |
 |                      |                                                                         |
-|                      | (only groups = 1 supported)                                             |
+|                      | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1    |
+|                      | or 2x2) convolution supported. Grouped convolution is not supported.    |
 +----------------------+-------------------------------------------------------------------------+
 | qnn.conv2d           | uint8:                                                                  |
 |                      |   Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize |
 |                      |                                                                         |
-|                      | (only groups = 1 supported)                                             |
+|                      | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1    |
+|                      | or 2x2) convolution supported. Grouped convolution is not supported.    |
 +----------------------+-------------------------------------------------------------------------+
 | nn.dense             | fp32:                                                                   |
 |                      |   Simple: nn.dense                                                      |
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index a78ad294b770..ae7a3f13076c 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -19,12 +19,15 @@
 import numpy as np
 import tvm
 
+from tvm._ffi import register_func
 from tvm.relay.expr import const
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
+from tvm.relay.testing.temp_op_attr import TempOpAttr
 
 from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr
 from .register import register_pattern_table
+from ..strategy.generic import is_depthwise_conv2d
 
 
 def is_arm_compute_runtime_enabled():
@@ -71,6 +74,61 @@ def partition_for_arm_compute_lib(mod, params=None):
     return seq(mod)
 
 
+@register_func("relay.ext.arm_compute_lib.optimize")
+def preprocess_module(mod):
+    """
+        Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI
+        kernel layout and fold the transforms away.
+
+        Parameters
+        ----------
+        mod : Module
+            The module to run passes on.
+
+        Returns
+        -------
+        preprocessed_mod : The processed module.
+    """
+
+    def convert_layout_conv2d(conv2d_function):
+        def convert_conv(attrs, inputs, tinfos, desired_layouts):
+            new_attrs = dict(attrs)
+            data_info = tinfos[0]
+            weight_info = tinfos[1]
+            desired_data_layout, desired_kernel_layout = map(str, desired_layouts)
+            new_attrs["data_layout"] = desired_data_layout
+            new_attrs["kernel_layout"] = desired_kernel_layout
+
+            if is_depthwise_conv2d(
+                data_info.shape,
+                attrs["data_layout"],
+                weight_info.shape,
+                attrs["kernel_layout"],
+                attrs["groups"],
+            ):
+                dkl = desired_kernel_layout
+                new_attrs["kernel_layout"] = dkl[3] + dkl[1:3] + dkl[0]
+            return conv2d_function(*inputs, **new_attrs)
+
+        return convert_conv
+
+    with TempOpAttr(
+        "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d)
+    ), TempOpAttr(
+        "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d)
+    ):
+        seq = tvm.transform.Sequential(
+            [
+                transform.ConvertLayout(
+                    {"nn.conv2d": ["NHWC", "OHWI"], "qnn.conv2d": ["NHWC", "OHWI"]}
+                ),
+                transform.FoldConstant(),
+            ]
+        )
+        preprocessed_mod = seq(mod)
+    return preprocessed_mod
+
+
 @register_pattern_table("arm_compute_lib")
 def arm_compute_lib_pattern_table():
     """Get the ACL pattern table."""
@@ -236,8 +294,6 @@ def _func_wrapper(expr):
 def conv2d(expr):
     """Check if the external ACL codegen for conv2d should be used."""
     attrs, args = expr.attrs, expr.args
-    if attrs.groups != 1:
-        return False
     if attrs.data_layout != "NHWC":
         return False
     if attrs.out_dtype != "float32" and attrs.out_dtype != "":
@@ -248,14 +304,25 @@ def conv2d(expr):
     kernel_typ = args[1].checked_type
     if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32":
         return False
+    is_depthwise = is_depthwise_conv2d(
+        data_typ.shape,
+        attrs["data_layout"],
+        kernel_typ.shape,
+        attrs["kernel_layout"],
+        attrs["groups"],
+    )
+    if is_depthwise:
+        return depthwise_conv2d(attrs, args)
+    # ACL doesn't support grouped convolution
+    if attrs.groups != 1 and not is_depthwise:
+        return False
     return True
 
 
 def qnn_conv2d(expr):
     """Check if the external ACL codegen for qnn.conv2d should be used."""
     attrs, args = expr.attrs, expr.args
-    if attrs.groups != 1:
-        return False
+
     if attrs.data_layout != "NHWC":
         return False
     if attrs.out_dtype != "int32" and attrs.out_dtype != "":
@@ -266,6 +333,40 @@ def qnn_conv2d(expr):
     kernel_typ = args[1].checked_type
     if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8":
         return False
+    is_depthwise = is_depthwise_conv2d(
+        data_typ.shape,
+        attrs["data_layout"],
+        kernel_typ.shape,
+        attrs["kernel_layout"],
+        attrs["groups"],
+    )
+    if is_depthwise:
+        return depthwise_conv2d(attrs, args)
+    # ACL doesn't support grouped convolution
+    if attrs.groups != 1 and not is_depthwise:
+        return False
+    return True
+
+
+def depthwise_conv2d(attrs, args):
+    """Check if the external ACL codegen for depthwise convolution should be used.
+
+    Note
+    ----
+    Relay does not have a depthwise conv2d operator whilst ACL does. We simply
+    separate the checks for depthwise for clarity.
+    """
+    kernel_typ = args[1].checked_type
+    # Only supports 3x3, 5x5 depthwise
+    if (
+        kernel_typ.shape[0] not in [3, 5]
+        or kernel_typ.shape[1] not in [3, 5]
+        or kernel_typ.shape[0] != kernel_typ.shape[1]
+    ):
+        return False
+    # Stride must be (1, 1) or (2, 2)
+    if (attrs.strides[0], attrs.strides[1]) not in [(1, 1), (2, 2)]:
+        return False
     return True
 
 
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 0b81cb9c7ec6..f0c79bed1218 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -22,9 +22,9 @@
 
 import tvm
 from tvm import te
-import tvm.relay as relay
-import tvm.relay.op as op
-from tvm.relay import Prelude
+from tvm import relay
+from tvm.relay import op
+from tvm.relay.prelude import Prelude
 from tvm.testing import enabled_targets
 
 from . import mlp
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
index a963242f82d5..e0669ae64bdb 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -24,6 +24,7 @@
 #include <tvm/ir/module.h>
 #include <tvm/relay/attrs/nn.h>
 #include <tvm/relay/type.h>
+#include <tvm/tir/analysis.h>
 
 #include <memory>
 #include <string>
@@ -126,7 +127,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       nodes.activation = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
-    if (backend::IsOp(current_call, "nn.bias_add")) {
+    if (backend::IsOp(current_call, "add")) {
       nodes.bias = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
@@ -154,19 +155,32 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
    */
   std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode* cn) {
     CompositeConvNode nodes = UnpackCompositeConvolution(cn);
-    std::string name = "nn.conv2d";
 
     const auto* conv_attr = nodes.conv->attrs.as<Conv2DAttrs>();
     ICHECK(conv_attr);
-    ICHECK(conv_attr->kernel_layout == "OHWI")
-        << "Kernel layout must be OHWI, has the module been pre-processed correctly?";
+
+    std::string name;
+    std::string name_prefix = "nn";
+
+    // Distinguish between normal and depth-wise convolution
+    if (conv_attr->channels.defined() &&
+        tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) &&
+        conv_attr->groups != 1) {
+      name = "depthwise_conv2d";
+      ICHECK(conv_attr->kernel_layout == "IHWO")
+          << "Kernel layout must be IHWO, has the module been pre-processed correctly?";
+    } else {
+      name = "conv2d";
+      ICHECK(conv_attr->kernel_layout == "OHWI")
+          << "Kernel layout must be OHWI, has the module been pre-processed correctly?";
+    }
 
     // Inputs must be added in the same order they appear in the relay graph.
     std::vector<JSONGraphNodeEntry> inputs;
     inputs.push_back(VisitExpr(cn->args[0])[0]);
     inputs.push_back(VisitExpr(nodes.conv->args[1])[0]);
     if (nodes.requantize) {
-      name = "qnn.conv2d";
+      name_prefix = "qnn";
       inputs.push_back(VisitExpr(nodes.conv->args[2])[0]);  // input zero-point
       inputs.push_back(VisitExpr(nodes.conv->args[3])[0]);  // kernel zero-point
       inputs.push_back(VisitExpr(nodes.conv->args[4])[0]);  // input scale
@@ -180,7 +194,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]);  // output zero-point
     }
 
-    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    auto json_node = std::make_shared<JSONGraphNode>(name_prefix + "." + name, "kernel", inputs, 1);
     SetCallNodeAttribute(json_node, nodes.conv);
 
     // Override attributes
@@ -224,10 +238,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
       nodes.requantize = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
-    if (backend::IsOp(current_call, "nn.bias_add")) {
+    if (backend::IsOp(current_call, "add")) {
       nodes.bias = current_call;
       current_call = current_call->args[0].as<CallNode>();
     }
+
     // Enforce a dense node exists at this point during traversal
     if (nodes.requantize) {
       ICHECK(backend::IsOp(current_call, "qnn.dense"));
@@ -329,25 +344,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
   }
 };
 
-/*!
- * \brief Pre-process a module containing functions ready for ACL codegen.
- *
- * For now we enforce OHWI kernel layout and fold the transforms away.
- *
- * \param mod The module to be pre-processed.
- * \return The processed module.
- */
-IRModule PreProcessModule(const IRModule& mod) {
-  IRModule preprocessed_module;
-  tvm::Map<String, Array<String>> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}},
-                                                     {"qnn.conv2d", {"NHWC", "OHWI"}}};
-  preprocessed_module = transform::ConvertLayout(desired_layouts)(mod);
-  preprocessed_module = transform::FoldConstant()(preprocessed_module);
-  return preprocessed_module;
-}
-
-TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProcessModule);
-
 /*!
  * \brief Create a runtime module for ACL.
  *
diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
index 09879bdc6e95..21fb94cb51b2 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -32,6 +32,7 @@
 #include <arm_compute/core/Types.h>
 #include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
 #include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
@@ -131,6 +132,9 @@ class ACLRuntime : public JSONRuntimeBase {
         if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) {
           CreateConvolution2DLayer(&layer_, node, mm);
           num_pools++;
+        } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) {
+          CreateDepthwiseConvolution2DLayer(&layer_, node, mm);
+          num_pools++;
         } else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
           CreateFullyConnectedLayer(&layer_, node, mm);
           num_pools++;
@@ -227,12 +231,7 @@ class ACLRuntime : public JSONRuntimeBase {
     arm_compute::ActivationLayerInfo act_info;
     if (node.HasAttr("activation_type")) {
       std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
-      if (activation_type == "relu") {
-        act_info = arm_compute::ActivationLayerInfo(
-            arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
-      } else {
-        LOG(FATAL) << "Unsupported activation function";
-      }
+      act_info = MakeACLActivationInfo(activation_type);
     }
 
     arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));
@@ -269,6 +268,64 @@ class ACLRuntime : public JSONRuntimeBase {
     layer->function = function;
   }
 
+  /*!
+   * \brief Create a 2D depthwise convolution layer.
+   *
+   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
+   * \param node The JSON representation of the operator.
+   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
+   */
+  void CreateDepthwiseConvolution2DLayer(
+      CachedLayer* layer, const JSONGraphNode& node,
+      const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
+    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
+    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);
+
+    arm_compute::ActivationLayerInfo act_info;
+    if (node.HasAttr("activation_type")) {
+      std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
+      act_info = MakeACLActivationInfo(activation_type);
+    }
+
+    arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));
+
+    // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases.
+    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
+    size_t num_inputs = inputs.size();
+    bool has_bias;
+    if (node.GetOpName() == "qnn.depthwise_conv2d") {
+      CHECK(num_inputs >= 8U && num_inputs <= 9U)
+          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
+      has_bias = num_inputs == 9;
+      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
+      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
+      if (has_bias) {
+        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
+      }
+      layer->outputs.push_back(
+          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
+    } else {
+      CHECK(num_inputs >= 2U && num_inputs <= 3U)
+          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
+      has_bias = num_inputs == 3;
+      for (const auto& i : inputs) {
+        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
+      }
+      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
+    }
+
+    // Depth multiplier is the final dimension in acl weights tensor (IWH*M*)
+    int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3];
+
+    auto function = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>(mm);
+    function->configure(&layer->inputs[0], &layer->inputs[1],
+                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info,
+                        depth_multiplier, act_info, dilation_2d);
+    layer->function = function;
+  }
+
   /*!
    * \brief Create a fully connected (dense) layer.
    *
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
index 604c619bf49c..3b2620987ab0 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -134,6 +134,16 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) {
   }
 }
 
+arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type) {
+  auto act_func = arm_compute::ActivationLayerInfo::ActivationFunction::IDENTITY;
+  if (activation_type == "relu") {
+    act_func = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+  } else {
+    LOG(FATAL) << "Activation " << activation_type << " unsupported by ACL runtime";
+  }
+  return {act_func};
+}
+
 template <typename T>
 std::vector<T> GetVectorFromDLTensor(const DLTensor* tensor) {
   ICHECK(tensor) << "Cannot convert a nullptr";
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h
index 576ed916ff60..dbb006fbb347 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.h
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h
@@ -108,6 +108,15 @@ arm_compute::PadStrideInfo MakeACLPadStride(const std::vector<std::string>& pad,
  */
 arm_compute::DataType MakeACLDataType(const DLDataType& data_type);
 
+/*!
+ * \brief Convert string to arm_compute::ActivationLayerInfo
+ *
+ * \param activation_type A string representing activation function.
+ * Currently supports the following options: "relu".
+ * \return arm_compute::ActivationLayerInfo.
+ */
+arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type);
+
 /*!
  * \brief Get a vector from DLTensor data.
  * \note Performs a copy of data.
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index c5d711d7afa3..80cd5847440e 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -303,45 +303,3 @@ def verify_codegen(
             f"Actual={codegen_str} \n"
             f"Expected={known_good_codegen_str}"
         )
-
-
-def generate_trials(space, r_factor=3):
-    """Generates a series of trials.
-
-    This algorithm generates a series of non-deterministic trials given a
-    space of options to test. A trial is generated by pulling a value from
-    each option in the space. On some occasions the values are shuffled to
-    ensure a different trial on each r_factor iteration. The algorithm ensures
-    that each value from an option is used at least once. The total number of
-    trials is determined by the r_factor * the option with the largest number
-    of values.
-
-    Parameters
-    ----------
-    space: List[List[Any]]
-        A list of different options with varying values to test.
-    r_factor: (optional) int
-        The repeat factor.
-
-    Returns
-    -------
-    A list of trials specifying values for each option.
-
-    """
-    np.random.seed(0)
-    max_len = 1
-    for option in space:
-        max_len = max(max_len, len(option))
-
-    num_trials = r_factor * max_len
-    trials = []
-    for i in range(num_trials):
-        trial = []
-        for option in space:
-            if i % len(option) == 0:
-                np.random.shuffle(option)
-            trial.append(option[i % len(option)])
-
-        trials.append(trial)
-
-    return trials
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
index 4496a2a1afa9..cc5bbfec7c69 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -21,15 +21,14 @@
 import tvm
 from tvm import relay
 
-from .infrastructure import (
+from test_arm_compute_lib.infrastructure import (
     skip_runtime_test,
     skip_codegen_test,
     build_and_run,
     verify,
     verify_codegen,
-    generate_trials,
 )
-from .infrastructure import Device
+from test_arm_compute_lib.infrastructure import Device
 
 
 def _get_model(
@@ -57,7 +56,12 @@ def _get_model(
         if len(padding) == 2:
             padding = (padding[0], padding[1], padding[0], padding[1])
         shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "HWOI" if is_depthwise else "HWIO"
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
     w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype))
     weights = relay.const(w, dtype)
     out = relay.nn.conv2d(
@@ -65,7 +69,7 @@ def _get_model(
         weights,
         kernel_size=(kernel_h, kernel_w),
         data_layout="NHWC",
-        kernel_layout="HWIO",
+        kernel_layout=weight_format,
         dilation=dilation,
         strides=strides,
         padding=padding,
@@ -75,7 +79,8 @@ def _get_model(
     )
     params = {"w": w}
     if has_bias:
-        b = tvm.nd.array(np.random.uniform(-128, 127, weight_shape[3]).astype(dtype))
+        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
+        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype(dtype))
         biasc = relay.const(b, dtype)
         out = relay.nn.bias_add(out, biasc, axis=3)
         params["b"] = b
@@ -134,7 +139,12 @@ def _get_qnn_model(
         if len(padding) == 2:
             padding = (padding[0], padding[1], padding[0], padding[1])
         shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3])
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "HWOI" if is_depthwise else "HWIO"
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups)
     w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype))
     weights = relay.const(w, dtype)
     out = relay.qnn.op.conv2d(
@@ -146,7 +156,7 @@ def _get_qnn_model(
         kernel_scale=relay.const(kernel_sc, "float32"),
         kernel_size=(kernel_h, kernel_w),
         data_layout="NHWC",
-        kernel_layout="HWIO",
+        kernel_layout=weight_format,
         dilation=dilation,
         strides=strides,
         padding=padding,
@@ -156,7 +166,8 @@ def _get_qnn_model(
     )
     params = {"w": w}
     if has_bias:
-        b = tvm.nd.array(np.random.uniform(0, 255, weight_shape[3]).astype("int32"))
+        bias_shape = weight_shape[2] if is_depthwise else weight_shape[3]
+        b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype("int32"))
         biasc = relay.const(b, "int32")
         out = relay.nn.bias_add(out, biasc, axis=3)
         params["b"] = b
@@ -188,21 +199,30 @@ def _get_expected_codegen(
 ):
     if len(padding) == 2:
         padding = (padding[0], padding[1], padding[0], padding[1])
-    weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups)
     output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1
     output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1
     output_shape = (1, int(output_height), int(output_width), channels)
     out_dtype = "int32" if dtype == "uint8" else "float32"
+    is_depthwise = shape[3] == channels == groups
+    weight_format = "IHWO" if is_depthwise else "OHWI"
+    if weight_format == "IHWO":
+        weight_shape = (shape[3] // groups, kernel_h, kernel_w, channels)
+    else:
+        weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups)
+    if is_depthwise:
+        name = "nn.depthwise_conv2d"
+    else:
+        name = "nn.conv2d"
 
     node = {
         "op": "kernel",
-        "name": "nn.conv2d",
+        "name": name,
         "inputs": [],
         "attrs": {
-            "groups": [["1"]],
+            "groups": [[str(groups)]],
             "num_outputs": "1",
             "data_layout": [["NHWC"]],
-            "kernel_layout": [["OHWI"]],
+            "kernel_layout": [[weight_format]],
             "channels": [[str(channels)]],
             "dilation": [[str(dilation[0]), str(dilation[1])]],
             "out_layout": [[""]],
@@ -229,7 +249,7 @@ def _get_expected_codegen(
 
     # qnn.conv2d params, input and kernel
     if dtype == "uint8":
-        node["name"] = "qnn.conv2d"
+        node["name"] = "qnn." + node["name"].split(".")[1]
         for param_dtype in ["int32", "float32"]:
             for _ in range(2):
                 inputs.append(
@@ -246,7 +266,10 @@ def _get_expected_codegen(
             {
                 "op": "const",
                 "name": "",
-                "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [[bias_dtype]]},
+                "attrs": {
+                    "shape": [[[1, 1, 1, weight_shape[3] if is_depthwise else weight_shape[0]]]],
+                    "dtype": [[bias_dtype]],
+                },
             }
         )
 
@@ -275,29 +298,43 @@ def test_conv2d():
     device = Device()
     np.random.seed(0)
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "float32"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         outputs = []
         inputs = {
             "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)),
@@ -338,31 +375,43 @@ def test_codegen_conv2d():
     if skip_codegen_test():
         return
 
-    np.random.seed(0)
-
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "float32"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         inputs = {"a"}
 
         args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
@@ -389,29 +438,43 @@ def test_qnn_conv2d():
     device = Device()
     np.random.seed(0)
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "uint8"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
 
@@ -463,36 +526,52 @@ def test_qnn_conv2d():
             "output scale": output_sc,
             "output zero point": output_zp,
         }
-        verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True)
+
+        atol = 2 if is_depthwise else 1
+        verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True)
 
 
 def test_codegen_qnn_conv2d():
     if skip_codegen_test():
         return
 
-    kernel_hs = [1, 2, 3, 5]
-    kernel_ws = [1, 2, 3, 5]
-    pad = [(1, 1), (2, 2), (2, 1)]
-    strides = [(1, 1), (2, 2)]
-    dilation = [(1, 1)]
-    out_channels = [4, 7, 16]
-    input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)]
-    # composite operator (pad, bias, activation)
-    composite = [
-        (False, False, False),
-        (False, True, False),
-        (False, False, True),
-        (False, True, True),
-        (True, False, False),
-    ]
     dtype = "uint8"
-    trials = generate_trials(
-        [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3
-    )
+    trials = [
+        # Normal convolution
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False],
+        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False],
+        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False],
+        # Depth-wise convolution
+        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True],
+    ]
 
-    for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials:
-        groups = 1
-        shape = (1, *input_shapes)
+    for (
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        out_channels,
+        shape,
+        composite,
+        is_depthwise,
+    ) in trials:
+        shape = (1, *shape)
+        if is_depthwise:
+            groups = shape[3]
+        else:
+            groups = 1
         inputs = {"a"}
 
         input_zp = 100
diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py
index 0279aa72eaf7..dba7be67a012 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_dense.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -28,7 +28,6 @@
     build_and_run,
     verify,
     verify_codegen,
-    generate_trials,
 )
 
 
@@ -184,17 +183,19 @@ def test_dense():
     device = Device()
     np.random.seed(0)
 
-    dtype = ["float32"]
-    shape = [
-        (1, (1, 128), (16, 128), 16),
-        (1, (32, 32), (32, 32), 32),
-        (0, (1, 64), (1, 64), 1),
-        (0, (11, 2), (2, 2), 2),
+    dtype = "float32"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+        [(11, 2), (2, 2), 2, True, 0],
+        [(11, 2), (2, 2), 2, False, 0],
     ]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
         func, params = _get_model(
@@ -230,19 +231,26 @@ def test_codegen_dense():
 
     np.random.seed(0)
 
-    dtype = ["float32"]
-    shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
+    dtype = "float32"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+    ]
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         inputs = {"a"}
 
         args = (shape, weight_shape, units, dtype)
 
         func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite)
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions)
+        verify_codegen(
+            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (2 - int(not composite))
+        )
 
 
 def test_qnn_dense():
@@ -254,19 +262,21 @@ def test_qnn_dense():
     device = Device()
     np.random.seed(0)
 
-    dtype = ["uint8"]
-    shape = [
-        (0, (4, 4), (4, 4), 4),
-        (1, (16, 16), (4, 16), 4),
-        (1, (1, 128), (16, 128), 16),
-        (1, (32, 32), (32, 32), 32),
-        (0, (1, 64), (1, 64), 1),
+    dtype = "uint8"
+    trials = [
+        [(4, 4), (4, 4), 4, True, 0],
+        [(4, 4), (4, 4), 4, False, 0],
+        [(16, 16), (4, 16), 4, True, 1],
+        [(16, 16), (4, 16), 4, False, 1],
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
     ]
 
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
-
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         outputs = []
         inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
         input_zp = 100
@@ -328,12 +338,17 @@ def test_codegen_qnn_dense():
 
     np.random.seed(0)
 
-    dtype = ["uint8"]
-    shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
-    composite = [False, True]
-    trials = generate_trials([dtype, shape, composite], 3)
+    dtype = "uint8"
+    trials = [
+        [(1, 128), (16, 128), 16, True, 1],
+        [(1, 128), (16, 128), 16, False, 1],
+        [(32, 32), (32, 32), 32, True, 1],
+        [(32, 32), (32, 32), 32, False, 1],
+        [(1, 64), (1, 64), 1, True, 0],
+        [(1, 64), (1, 64), 1, False, 0],
+    ]
 
-    for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
+    for shape, weight_shape, units, composite, acl_partitions in trials:
         inputs = {"a"}
         args = (shape, weight_shape, units, dtype)
 
@@ -357,7 +372,9 @@ def test_codegen_qnn_dense():
             has_bias=composite,
         )
         exp_codegen = _get_expected_codegen(*args, has_bias=composite)
-        verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions)
+        verify_codegen(
+            func, exp_codegen, acl_partitions, (1 - acl_partitions) * (3 - int(not composite))
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 898446b32ed9..462df143b447 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -123,7 +123,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=73, acl_partitions=18, atol=0.002, rtol=0.01
+        *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01
     )
 
 
@@ -148,7 +148,7 @@ def get_model():
         return mod, params, inputs
 
     _build_and_run_network(
-        *get_model(), device=device, tvm_ops=42, acl_partitions=17, atol=8, rtol=0
+        *get_model(), device=device, tvm_ops=3, acl_partitions=30, atol=9, rtol=0
     )
 
 

From 6067c6dd2a75c6dc4da319b72ca9840ad955edf9 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Fri, 8 Jan 2021 15:08:35 +0000
Subject: [PATCH 2/3] linter

---
 python/tvm/relay/op/contrib/arm_compute_lib.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index ae7a3f13076c..8a03cb173612 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -77,17 +77,17 @@ def partition_for_arm_compute_lib(mod, params=None):
 @register_func("relay.ext.arm_compute_lib.optimize")
 def preprocess_module(mod):
     """
-        Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI
-        kernel layout and fold the transforms away.
+    Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI
+    kernel layout and fold the transforms away.
 
-        Parameters
-        ----------
-        mod : Module
-            The module to run passes on.
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
 
-        Returns
-        -------
-        preprocessed_mod : The processed module.
+    Returns
+    -------
+    preprocessed_mod : The processed module.
     """
 
     def convert_layout_conv2d(conv2d_function):

From b3a9c2df2f48fd7ea635a2e1aab74443069afbcb Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Fri, 8 Jan 2021 20:52:03 +0000
Subject: [PATCH 3/3] CHECK -> ICHECK

---
 src/runtime/contrib/arm_compute_lib/acl_runtime.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
index 21fb94cb51b2..ed8f6adbd083 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -296,7 +296,7 @@ class ACLRuntime : public JSONRuntimeBase {
     size_t num_inputs = inputs.size();
     bool has_bias;
     if (node.GetOpName() == "qnn.depthwise_conv2d") {
-      CHECK(num_inputs >= 8U && num_inputs <= 9U)
+      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
           << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
       has_bias = num_inputs == 9;
       layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
@@ -307,7 +307,7 @@ class ACLRuntime : public JSONRuntimeBase {
       layer->outputs.push_back(
           MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
     } else {
-      CHECK(num_inputs >= 2U && num_inputs <= 3U)
+      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
           << "Convolution requires 3 inputs with a bias, 2 inputs without.";
       has_bias = num_inputs == 3;
       for (const auto& i : inputs) {