From 30fe89b9edb4bd1be1eac0bedd2a621345721e1d Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 27 Aug 2020 19:54:53 +0100 Subject: [PATCH 01/10] Add dot product support for quantized convolution. We added two new intrinsics in: topi/arm_cpu/tensor_intrin.py, namely - mmla4x4: compute a matrix multiplication between tile A(4,4) and tile B(4,4) - mmla16x4: compute a matrix multiplication between tile A(rows,4) and tile B(4,16) Then we used those intrinsics in two separate strategies. We added the strategies in topi/arm_cpu/conv2d_int8.py and implemented the schedules in topi/arm_cpu/conv2d_gemm.py. In particular: - schedule_conv2d_gemm, when accelerated, packs matrix A, compute GEMM, and unpack the resulting matrix. This uses the mmla4x4 intrinsic - schedule_conv2d_gemm_hybrid doesn't do any packing on A and C which are in native form. This uses the mmla16x4 intrinsic Please note that for the limitations of `tensorize` we need to pad matrix A in both cases (when dimensions are not multiple of the tiling shape) Change-Id: Id0d818d84ffc458c6dad7983fd350a0f3d5db395 --- python/tvm/relay/op/strategy/arm_cpu.py | 38 +-- python/tvm/topi/arm_cpu/arm_utils.py | 29 +++ python/tvm/topi/arm_cpu/conv2d_alter_op.py | 95 +++++--- python/tvm/topi/arm_cpu/conv2d_gemm.py | 265 ++++++++++++++------- python/tvm/topi/arm_cpu/conv2d_int8.py | 101 ++++++-- python/tvm/topi/arm_cpu/tensor_intrin.py | 231 +++++++++++++++--- 6 files changed, 582 insertions(+), 177 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/arm_utils.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 54624ce35b55..77e9a010629e 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -135,20 +135,25 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_direct_simd.micro_dev", ) elif kernel_layout == "HWIO": - is_aarch64 = "aarch64" in str(isa.target) - + is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_fast_int8_on_arm() + if has_dot_prod and data.dtype in ["int8", "uint8"]: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), + name="conv2d_NHWC_quantized_hybrid.arm_cpu") if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu", - ) - - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu", - ) + name="conv2d_NHWC_quantized.arm_cpu") + # TODO + # This strategy errors out when tuning. Let us comment it out + # but not remove. 
+ # strategy.add_implementation( + # wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + # wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + # name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -328,12 +333,17 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() - if layout == "NHWC" and data.dtype in ["int8", "uint8"]: + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform + hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform + if layout == "NHWC" and data.dtype in ['int8', 'uint8']: strategy.add_implementation( - wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform), + wrap_compute_conv2d_gemm(interleaved_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", - ) + name="conv2d_NHWC_quantized_without_transform.arm_cpu") + strategy.add_implementation( + wrap_compute_conv2d_gemm(hybrid_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), + name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") else: raise RuntimeError( "Unsupported conv2d_NHWC_quantized_without_transform layout {0}" diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py new file mode 100644 index 000000000000..35e43a36562e --- /dev/null +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Arm target utility functions""" + +import tvm +def is_fast_int8_on_arm(): + """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + target = tvm.target.Target.current(allow_none=False) + return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + +def is_aarch64_arm(): + """ Checks whether we are compiling for an AArch64 target. 
""" + target = tvm.target.Target.current(allow_none=False) + return 'aarch64' in target.attrs.get("mtriple", "") diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index 7bf7e42237d2..bb65cb2d4664 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,9 +27,47 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config +from .arm_utils import is_fast_int8_on_arm logger = logging.getLogger("topi") +def interleave_transpose_B(inputs, data, kernel, interleave_A): + """ Return the new placeholder and the expression that represent + the matrix B transposed and interleaved""" + + assert (data.dtype == 'int8' and kernel.dtype == 'int8' or + data.dtype == 'uint8' and kernel.dtype == 'uint8') + + KH, KW, IC, OC = get_const_tuple(kernel.shape) + K = KH * KW * IC + N = OC + + if is_fast_int8_on_arm(): + tile_rows_B = 12 if interleave_A else 16 + tile_cols_B = 4 + else: + tile_rows_B = 4 + tile_cols_B = 16 + + pad_K = 0 + pad_N = 0 + + if N % tile_rows_B != 0: + pad_N = tile_rows_B - (N % tile_rows_B) + if K % tile_cols_B != 0: + pad_K = tile_cols_B - (K % tile_cols_B) + + N_padded = N + pad_N + K_padded = K + pad_K + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], + tile_rows_B, + tile_cols_B) + new_kernel = te.placeholder((N_padded // tile_rows_B, + K_padded // tile_cols_B, + tile_rows_B, + tile_cols_B), kernel.dtype) + return new_kernel, new_kernel_expr + @conv2d_alter_layout.register(["arm_cpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): @@ -280,43 +318,34 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": - assert ( - data.dtype == "int8" - and kernel.dtype == "int8" - or data.dtype == "uint8" - and kernel.dtype == "uint8" - ) assert data_layout == "NHWC" and kernel_layout == "HWIO" KH, KW, IC, OC = get_const_tuple(kernel.shape) - K = KH * KW * IC N = OC - - tile_rows = 4 - tile_cols = 16 - pad_K = 0 - pad_N = 0 - - if N % tile_rows != 0: - pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) - - N_padded = N + pad_N - K_padded = K + pad_K - kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_rows, tile_cols) - new_kernel = te.placeholder( - (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols), kernel.dtype - ) - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_workload = autotvm.task.args_to_workload( - [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], - new_workload_name, - ) + new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, + kernel, interleave_A=True) + new_workload = autotvm.task.args_to_workload([data, new_kernel, + strides, padding, dilation, + out_dtype, (KH, KW), OC], + new_workload_name) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform( - inputs[0], kernel_expr, **new_attrs - ) - + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], + new_kernel_expr, + **new_attrs) + if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + KH, KW, IC, OC = get_const_tuple(kernel.shape) + N = OC + 
new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, + kernel, interleave_A=False) + new_workload = autotvm.task.args_to_workload([data, new_kernel, + strides, padding, dilation, + out_dtype, (KH, KW), OC], + new_workload_name) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], + new_kernel_expr, + **new_attrs) return None diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 7f73cc828fa7..14a192b38821 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,19 +23,44 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl +from .tensor_intrin import gemm_quantized, gemm_quantized_impl,\ + mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32 +from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm +def configure_knobs(cfg, M, K): + """ Configure auto-tuning knobs for the interleaved strategy """ -def is_aarch64_arm(): - """ Checks whether we are compiling for an AArch64 target. """ - target = tvm.target.Target.current(allow_none=False) - return "aarch64" in target.attrs.get("mtriple", "") + x, y = cfg.axis(M // 4), cfg.axis(K // 16) + cfg.define_reorder('reorder_gemm', + [x, y], + policy='candidate', + candidate=[[x, y], + [y, x]]) + outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) + cfg.define_annotate("A_interleaved_unroll_vec", + [outer_loop, inner_loop], + policy="try_unroll_vec") + + # Fallback configuration + if cfg.is_fallback: + cfg['reorder_gemm'] = ReorderEntity([0, 1]) + cfg['A_interleaved_unroll_vec'] = AnnotateEntity(["unroll", "vec"]) + + if not is_fast_int8_on_arm(): + cfg.define_knob('gemm_quantized_unroll', [True, False]) + cfg.define_knob('gemm_quantized_interleave', [True, False]) + + if cfg.is_fallback: + cfg['gemm_quantized_unroll'] = OtherOptionEntity(False) + cfg['gemm_quantized_interleave'] = OtherOptionEntity(True) # Compute function -def compute_conv2d_gemm_without_weight_transform( - cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels -): +def compute_conv2d_gemm_without_weight_transform(cfg, + data, B_interleaved_t, + strides, padding, dilation, + out_dtype, kernel_size, + output_channels, interleave_A): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) @@ -74,38 +99,38 @@ def compute_conv2d_gemm_without_weight_transform( A_shape = (batches, M, K) if K_AREA == 1: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], - name="data_flatten", - ) + A = tvm.topi.reshape(data_pad, A_shape) else: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[ - n, - HSTR * (x // OW) + dilation_h * ((y // IC) // KW), - WSTR * (x % OW) + dilation_w * ((y // IC) % KW), - y % IC, - ], - name="data_im2col", - ) - N_transformed = B_interleaved_t.shape[0] + A = te.compute(A_shape, lambda n, x, y: + data_pad[n, + HSTR * (x // OW) + dilation_h * ((y // IC) // KW), + WSTR * (x % OW) + dilation_w * ((y // IC) % KW), y % IC], + name='data_im2col') # --- Pad if necessary - idxm = tvm.tir.indexmod + N_transformed = B_interleaved_t.shape[0] + tile_rows_B = 
B_interleaved_t.shape[2] + tile_cols_B = B_interleaved_t.shape[3] + + if is_fast_int8_on_arm() and interleave_A: + tile_rows_A = 8 + tile_cols_A = 4 + else: + tile_rows_A = 4 + tile_cols_A = 16 pad_m = 0 pad_k = 0 - if M % 4 != 0: - pad_m = 4 - (M % 4) + if M % tile_rows_A != 0: + pad_m = tile_rows_A - (M % tile_rows_A) - if K % 16 != 0: - pad_k = 16 - (K % 16) + if K % tile_cols_A != 0: + pad_k = tile_cols_A - (K % tile_cols_A) M_padded = M + pad_m K_padded = K + pad_k + N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) pad_after = (0, pad_m, pad_k) @@ -113,59 +138,75 @@ def compute_conv2d_gemm_without_weight_transform( if pad_m != 0 or pad_k != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") - # --- GEMM: A*B' + idxm = tvm.tir.indexmod k = te.reduce_axis((0, K_padded), "k") - A_interleaved = te.compute( - (batches, M_padded // 4, K_padded // 16, 4, 16), - lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], - name="A_interleaved", - ) - - C_interleaved = te.compute( - (batches, M_padded // 4, N_transformed, 4, 4), - lambda b, x, y, w, z: te.sum( - A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) - * B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), - axis=k, - ), - name="C_interleaved", - ) - - # --- Unpack C - C = te.compute( - (batches, M, N), - lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], - name="C", - ) + if interleave_A: + # Configuration space + configure_knobs(cfg, M_padded, K_padded) + + # Pack A + A_interleaved = te.compute((batches, + M_padded // tile_rows_A, + K_padded // tile_cols_A, + tile_rows_A, + tile_cols_A), + lambda b, x, y, z, w: A[b, + z + tile_rows_A * x, + w + tile_cols_A * y], + name='A_interleaved') + # Compute C + C_interleaved = te.compute((batches, + M_padded // tile_rows_A, + N_transformed, + tile_rows_A, + tile_rows_B), + lambda b, x, y, w, z: + te.sum(A_interleaved[b, + x, + k//tile_cols_A, + w, + idxm(k, tile_cols_A)].astype('int32')* + B_interleaved_t[y, + k//tile_cols_B, + z, + idxm(k, tile_cols_B)].astype('int32'), + axis=k), + name='C_interleaved') + # Unpack C + C = te.compute((batches, M, N), + lambda b, x, y: + C_interleaved[b, + x // tile_rows_A, + y // tile_rows_B, + idxm(x, tile_rows_A), + idxm(y, tile_rows_B)].astype(out_dtype), + name="C") + zero = tvm.tir.const(0) + else: + # No need to pack/unpack + C = te.compute((batches, + M_padded, + N_padded), + lambda b, x, y: + te.sum(A[b, x, k].astype('int32')* + B_interleaved_t[y//tile_rows_B, + k//tile_cols_B, + idxm(y, tile_rows_B), + idxm(k, tile_cols_B)].astype('int32'), + axis=k), + name='C') + zero = tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] - \ + tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] # --- Produce the conv output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name="conv2d_gemm_output") - - # Configuration space - x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16) - cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) - - outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate( - "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" - ) - cfg.define_knob("gemm_quantized_unroll", [True, False]) - cfg.define_knob("gemm_quantized_interleave", [True, False]) - - # Fallback configuration - if cfg.is_fallback: - cfg["reorder_gemm"] = ReorderEntity([0, 1]) - cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - 
cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) - cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) + out = te.compute(out_shape, lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name='conv2d_gemm_output') return out - -# Schedules def schedule_conv2d_gemm(cfg, s, out, final_out): - """Create schedule for tensors""" + """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] A_interleaved = C_interleaved.op.input_tensors[0] @@ -192,8 +233,8 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis - outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) - s[C_interleaved].reorder(yi, xi) + outer_gemm, inner_gemm = cfg['reorder_gemm'].apply(s, C_interleaved, [xo, yo]) + b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) s[A_interleaved].compute_at(s[C_interleaved], b_outer_gemm_fused) @@ -204,12 +245,29 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): in_type = A_interleaved.dtype out_type = C.dtype - if is_aarch64_arm() and out_type == "int32": + + k = C_interleaved.op.reduce_axis[0] + _, M, N = C.shape + if is_fast_int8_on_arm(): + mmla = mmla_4x4_int8_int8_int32(in_type) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(xi, + yi, + x_factor=8, + y_factor=4) + k_outer, k_inner = s[C_interleaved].split(k, 4) + xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4) + s[C_interleaved].reorder(b_outer_gemm_fused, inner_gemm, xi_outer, + yi_outer, k_outer, xi_inner_outer, xi_inner_inner, + yi_inner, k_inner) + s[C_interleaved].tensorize(xi_inner_inner, mmla) + s[C_interleaved].unroll(xi_inner_outer) + + elif is_aarch64_arm(): + s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - _, M, N = C.shape - assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" - unroll = cfg["gemm_quantized_unroll"].val - interleave = cfg["gemm_quantized_interleave"].val + assert in_type in ['int8', 'uint8'], "Only int8 and uint8 gemm are supported" + unroll = cfg['gemm_quantized_unroll'].val + interleave = cfg['gemm_quantized_interleave'].val gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) s[C_interleaved].pragma( b_outer_gemm_fused, @@ -218,6 +276,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): ) s[C_interleaved].tensorize(yi, gemm) + # Output transform if out != final_out: n, h, w, c = out.op.axis @@ -225,3 +284,47 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[C].compute_at(s[out], inner) s[out].vectorize(inner) return s + +def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): + """ Schedule the conv2d_gemm hybrid strategy """ + C = out.op.input_tensors[0] + A = C.op.input_tensors[0] + in_type = A.dtype + + # Computation + b, x, y = C.op.axis + k, = C.op.reduce_axis + k_outer, k_inner = s[C].split(k, 16) + x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) + s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) + mmla = mmla_16x4_int8_int8_int32(in_type, rows=1) + s[C].unroll(x_inner) + s[C].tensorize(y_inner, mmla) + s[C].parallel(x_outer) + + # Input transform + if A.op.name == "A_padded": + padding_A = True + data_im2col = A.op.input_tensors[0] + else: + padding_A = False + data_im2col = A + + b, m, n = data_im2col.op.axis + if data_im2col.op.name == "data_im2col": + n_outer, n_inner = s[data_im2col].split(n, 16) + 
s[data_im2col].unroll(n_outer) + s[data_im2col].vectorize(n_inner) + s[data_im2col].parallel(m) + elif padding_A: + s[data_im2col].compute_inline() + s[A].compute_at(s[C], x_inner) + else: + s[data_im2col].compute_at(s[C], x_inner) + + # Output transform + if out != final_out: + n, h, w, c = out.op.axis + _, inner = s[out].split(c, 4) + s[out].vectorize(inner) + return s diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 307f9e102acf..80ed67ca79a5 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,8 +24,9 @@ from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm - +from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform,\ + schedule_conv2d_gemm, schedule_conv2d_gemm_hybrid +from .arm_utils import is_fast_int8_on_arm def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): """ @@ -40,6 +41,19 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4 ) +def get_tiling_B(interleave_A): + """ Compute the tiling information for matrix B in A*B=C """ + if is_fast_int8_on_arm(): + # Two options in case dotprod is enabled + tile_cols_B = 4 + tile_rows_B = 12 if interleave_A else 16 + else: + # Single option if dotprod is not available + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B + @autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): @@ -115,30 +129,27 @@ def traverse(op): return s -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): +def _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, + dilation, out_dtype, interleave_A): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows = 4 - tile_cols = 16 - kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols) - return compute_conv2d_gemm_without_weight_transform( - cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC - ) + tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) + kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) + return compute_conv2d_gemm_without_weight_transform(cfg, + data, kernel, strides, padding, + dilation, out_dtype, (KH, KW), + OC, interleave_A) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None -): - """Compute for conv2d_NHWC_quantized without weight transform.""" - return compute_conv2d_gemm_without_weight_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels - ) +def _compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size=None, + output_channels=None, interleave_A=False): + return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size, + output_channels, interleave_A) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): 
+def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): """Create schedule for tensors""" s = te.create_schedule([x.op for x in outs]) # Vectorize the output and then inline all the rest @@ -153,12 +164,58 @@ def _callback(op): """Traverse operators from computation graph""" if op.name == "conv2d_gemm_output": conv_out = op.output(0) - schedule_conv2d_gemm(cfg, s, conv_out, out) + if interleave_A: + schedule_conv2d_gemm(cfg, s, conv_out, out) + else: + schedule_conv2d_gemm_hybrid(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: C = conv_out.op.input_tensors[0] - s[C].compute_at(s[out], inner) + if interleave_A: + s[C].compute_at(s[out], inner) traverse_inline(s, outs[0].op, _callback) return s + +#### Interleaved schedules +@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") +def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for interleaved compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, + padding, dilation, out_dtype, True) + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, dilation, + out_dtype, kernel_size, output_channels): + """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, + strides, padding, dilation, + out_dtype, kernel_size, + output_channels, True) + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") +def schedule_conv2d_NHWC_quantized(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + +#### Hybrid schedules (A non interleaved, B is interleaved and transposed) +@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") +def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for hybrid compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, + padding, dilation, out_dtype, False) + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_hybrid_without_transform(cfg, data, kernel, strides, padding, + dilation, out_dtype, + kernel_size, output_channels): + """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, + dilation, out_dtype, kernel_size, + output_channels, False) + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") +def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): + """ Interface for hybrid schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index e87bdc47d0b0..ea55cd99e876 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,36 +417,19 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") + C = te.compute((te.var("m"), te.var("n")), + lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype('int32') * + B[k // 16, y, idxm(k, 16)].astype('int32'), + axis=k), name="C") - C = te.compute( - 
(te.var("m"), te.var("n")), - lambda x, y: te.sum( - A[k // 16, x, idxm(k, 16)].astype(out_type) - * B[k // 16, y, idxm(k, 16)].astype(out_type), - axis=k, - ), - name="C", - ) + a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", + offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) - a_buffer = tvm.tir.decl_buffer( - A.shape, - dtype=in_type, - name="a_buffer", - offset_factor=1, - strides=[te.var("sa_1"), te.var("sa_2"), 1], - ) - - b_buffer = tvm.tir.decl_buffer( - B.shape, - dtype=in_type, - name="b_buffer", - offset_factor=1, - strides=[te.var("sb_1"), te.var("sb_2"), 1], - ) + b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", + offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) - c_buffer = tvm.tir.decl_buffer( - C.shape, dtype=out_type, name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] - ) + c_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="c_buffer", + offset_factor=1, strides=[te.var('sc'), 1]) def _intrin_func(ins, outs): def _instr(): @@ -589,6 +572,200 @@ def _instr(index): ) +def select_word(vec, lane, dtype_vec): + """ + Utility function used to select a int8x4 word within a int8x16 vector + and replicate 4 times. + The pseudo-code for this operation is: + + v = [x0, ..., x15] + vsub(i) = v[i:i+3] + replicated_v(i) = [vsub(i), vsub(i), vsub(i), vsub(i)] + + Note that i can vary between 0 and 3 + """ + # Reinterpret vec_a as 4 int32 words + vec_int32 = tvm.tir.call_intrin('int32x4', 'tir.reinterpret', vec) + # Broadcast the lane-th word + vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) + # Convert back to uint8x16 + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, 'tir.reinterpret', vec_int32_shuffled) + return vec_int8_broadcast + +def mmla_4x4_int8_int8_int32(dtype): + """ + Int8 4x4 matrix multiplication using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[4][4] and + B[4][4] and produces a 4x4 matrix which is equal to A*B + The pseudo code is as follows. + + .. code-block:: c + + void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 output[4][4]){ + for (int i = 0; i < 4; i++){ + for (int j = 0; i < 4; i++){ + out[i][j] = 0; + for (int k = 0; k < 4; k++){ + out[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * Matrix A is interleaved + This function returns a TensorIntrin that can be used to tensorize a schedule. 
+ + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + data = te.placeholder((te.var("rows"), 4), dtype, name='data') + kernel = te.placeholder((4, 4), dtype, name='kernel') + dtype_vec = dtype + 'x16' + + k = te.reduce_axis((0, 4), name='k') + C = te.compute((te.var("rows"), 4), + lambda i, j: te.sum(data[i, k].astype('int32') * + kernel[j, k].astype('int32'), + axis=k), name="C") + + aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", + offset_factor=1, + strides=[te.var('sa'), 1]) + bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", + offset_factor=1, + strides=[te.var('sb'), 1]) + cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", + offset_factor=1, + strides=[te.var('sc'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, 4): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x4'))) + return ib.get() + + vec_a = ins[0].vload([0, 0]) + vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + vec_b = ins[1].vload([0, 0], dtype_vec) + + # Execute the dot product + for i in range(0, 4): + vec_c = outs[0].vload([i, 0], 'int32x4') + vdot = tvm.tir.call_llvm_intrin( + 'int32x4', + 'llvm.aarch64.neon.sdot', + tvm.tir.const(3, 'uint32'), + vec_c, vec_b, vec_aa[i]) + + # Store the result + ib.emit(outs[0].vstore([i, 0], vdot)) + + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, + default_buffer_params=buffer_params) + +def mmla_16x4_int8_int8_int32(dtype, rows): + """ + Int8 16x4 matrix multiplication using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[rows][4] and + B[4][16] and produces a rowsx16 matrix which is equal to A*B + The pseudo code is as follows. + + .. code-block:: c + + void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 output[rows][16]){ + for (int i = 0; i < rows; i++){ + for (int j = 0; i < 16; i++){ + out[i][j] = 0; + for (int k = 0; k < 4; k++){ + out[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * A is not interleaved, but used in its native form + This function returns a TensorIntrin that can be used to tensorize a schedule. 
+ + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + data = te.placeholder((rows, 16), dtype, name='data') + kernel = te.placeholder((4, 16, 4), dtype, name='kernel') + dtype_vec = dtype + 'x16' + idxm = tvm.tir.indexmod + k = te.reduce_axis((0, 16), name='k') + C = te.compute((rows, 16), + lambda i, j: te.sum(data[i, k].astype('int32') * + kernel[k//4, j, idxm(k, 4)].astype('int32'), + axis=k), name="C") + + aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", + offset_factor=1, + strides=[te.var('sa'), 1]) + bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", + offset_factor=1, + strides=[te.var('sb0'), te.var('sb1'), 1]) + cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", + offset_factor=1, + strides=[te.var('sc'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, rows): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x16'))) + return ib.get() + + for k in range(0, rows): + vec_a = ins[0].vload([k, 0], dtype_vec) + + for j in range(0, 4): + for i in range(0, 4): + vec_aa = select_word(vec_a, i, dtype_vec) + vec_b = ins[1].vload([i, 4*j, 0], dtype_vec) + vec_c = outs[0].vload([k, 4*j], 'int32x4') + vdot = tvm.tir.call_llvm_intrin( + 'int32x4', + 'llvm.aarch64.neon.sdot', + tvm.tir.const(3, 'uint32'), + vec_c, vec_b, vec_aa) + ib.emit(outs[0].vstore([k, 4*j], vdot)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, + default_buffer_params=buffer_params) + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics From 8b69ffd1e76bdf56e044f247a439a96fe044bc92 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 10 Sep 2020 19:52:45 +0100 Subject: [PATCH 02/10] Add back nhwc_spatial_pack strategy as default Change-Id: I8b1826a7ae1d742956296e8d157da19955a4942c --- python/tvm/relay/op/strategy/arm_cpu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 77e9a010629e..78251c89bd6f 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -147,13 +147,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), name="conv2d_NHWC_quantized.arm_cpu") - # TODO - # This strategy errors out when tuning. Let us comment it out - # but not remove. - # strategy.add_implementation( - # wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - # wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - # name="conv2d_nhwc_spatial_pack.arm_cpu") + if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): + # TODO + # This strategy errors out for quantized data types when tuning. 
+ # Let's use this only for non-aarch64 or non-quantized cases + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) From 16292e921c16c6f37cd1012e0cd3a7779ae0c850 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 16 Sep 2020 11:13:58 +0100 Subject: [PATCH 03/10] Fix linting through Black Change-Id: Ic74ef5461a90bca9f4d4980a214137e384d5f923 --- python/tvm/relay/op/strategy/arm_cpu.py | 17 +- python/tvm/topi/arm_cpu/arm_utils.py | 5 +- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 60 ++++--- python/tvm/topi/arm_cpu/conv2d_gemm.py | 190 +++++++++++---------- python/tvm/topi/arm_cpu/conv2d_int8.py | 101 +++++++---- python/tvm/topi/arm_cpu/tensor_intrin.py | 172 +++++++++++-------- 6 files changed, 326 insertions(+), 219 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 78251c89bd6f..9fe6e28e6f84 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -141,12 +141,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid.arm_cpu") + name="conv2d_NHWC_quantized_hybrid.arm_cpu", + ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu") + name="conv2d_NHWC_quantized.arm_cpu", + ) if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): # TODO # This strategy errors out for quantized data types when tuning. 
@@ -154,7 +156,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + name="conv2d_nhwc_spatial_pack.arm_cpu", + ) else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -336,15 +339,17 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform - if layout == "NHWC" and data.dtype in ['int8', 'uint8']: + if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d_gemm(interleaved_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu") + name="conv2d_NHWC_quantized_without_transform.arm_cpu", + ) strategy.add_implementation( wrap_compute_conv2d_gemm(hybrid_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") + name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu", + ) else: raise RuntimeError( "Unsupported conv2d_NHWC_quantized_without_transform layout {0}" diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index 35e43a36562e..b9c7010d6ee3 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -18,12 +18,15 @@ """Arm target utility functions""" import tvm + + def is_fast_int8_on_arm(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + def is_aarch64_arm(): """ Checks whether we are compiling for an AArch64 target. 
""" target = tvm.target.Target.current(allow_none=False) - return 'aarch64' in target.attrs.get("mtriple", "") + return "aarch64" in target.attrs.get("mtriple", "") diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index bb65cb2d4664..d669fa695e58 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -31,12 +31,17 @@ logger = logging.getLogger("topi") + def interleave_transpose_B(inputs, data, kernel, interleave_A): """ Return the new placeholder and the expression that represent the matrix B transposed and interleaved""" - assert (data.dtype == 'int8' and kernel.dtype == 'int8' or - data.dtype == 'uint8' and kernel.dtype == 'uint8') + assert ( + data.dtype == "int8" + and kernel.dtype == "int8" + or data.dtype == "uint8" + and kernel.dtype == "uint8" + ) KH, KW, IC, OC = get_const_tuple(kernel.shape) K = KH * KW * IC @@ -59,13 +64,12 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): N_padded = N + pad_N K_padded = K + pad_K - new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], - tile_rows_B, - tile_cols_B) - new_kernel = te.placeholder((N_padded // tile_rows_B, - K_padded // tile_cols_B, - tile_rows_B, - tile_cols_B), kernel.dtype) + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform( + inputs[1], tile_rows_B, tile_cols_B + ) + new_kernel = te.placeholder( + (N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype + ) return new_kernel, new_kernel_expr @@ -322,30 +326,32 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): KH, KW, IC, OC = get_const_tuple(kernel.shape) N = OC new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, - kernel, interleave_A=True) - new_workload = autotvm.task.args_to_workload([data, new_kernel, - strides, padding, dilation, - out_dtype, (KH, KW), OC], - new_workload_name) + new_kernel, new_kernel_expr = interleave_transpose_B( + inputs, data, kernel, interleave_A=True + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], - new_kernel_expr, - **new_attrs) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs + ) if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" KH, KW, IC, OC = get_const_tuple(kernel.shape) N = OC new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, - kernel, interleave_A=False) - new_workload = autotvm.task.args_to_workload([data, new_kernel, - strides, padding, dilation, - out_dtype, (KH, KW), OC], - new_workload_name) + new_kernel, new_kernel_expr = interleave_transpose_B( + inputs, data, kernel, interleave_A=False + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], - new_kernel_expr, - **new_attrs) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs + ) return None diff 
--git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 14a192b38821..09c53001963f 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,44 +23,53 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl,\ - mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32 +from .tensor_intrin import ( + gemm_quantized, + gemm_quantized_impl, + mmla_4x4_int8_int8_int32, + mmla_16x4_int8_int8_int32, +) from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm + def configure_knobs(cfg, M, K): """ Configure auto-tuning knobs for the interleaved strategy """ x, y = cfg.axis(M // 4), cfg.axis(K // 16) - cfg.define_reorder('reorder_gemm', - [x, y], - policy='candidate', - candidate=[[x, y], - [y, x]]) + cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate("A_interleaved_unroll_vec", - [outer_loop, inner_loop], - policy="try_unroll_vec") + cfg.define_annotate( + "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + ) # Fallback configuration if cfg.is_fallback: - cfg['reorder_gemm'] = ReorderEntity([0, 1]) - cfg['A_interleaved_unroll_vec'] = AnnotateEntity(["unroll", "vec"]) + cfg["reorder_gemm"] = ReorderEntity([0, 1]) + cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) if not is_fast_int8_on_arm(): - cfg.define_knob('gemm_quantized_unroll', [True, False]) - cfg.define_knob('gemm_quantized_interleave', [True, False]) + cfg.define_knob("gemm_quantized_unroll", [True, False]) + cfg.define_knob("gemm_quantized_interleave", [True, False]) if cfg.is_fallback: - cfg['gemm_quantized_unroll'] = OtherOptionEntity(False) - cfg['gemm_quantized_interleave'] = OtherOptionEntity(True) + cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) + cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) + # Compute function -def compute_conv2d_gemm_without_weight_transform(cfg, - data, B_interleaved_t, - strides, padding, dilation, - out_dtype, kernel_size, - output_channels, interleave_A): +def compute_conv2d_gemm_without_weight_transform( + cfg, + data, + B_interleaved_t, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, +): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) @@ -101,11 +110,16 @@ def compute_conv2d_gemm_without_weight_transform(cfg, if K_AREA == 1: A = tvm.topi.reshape(data_pad, A_shape) else: - A = te.compute(A_shape, lambda n, x, y: - data_pad[n, - HSTR * (x // OW) + dilation_h * ((y // IC) // KW), - WSTR * (x % OW) + dilation_w * ((y // IC) % KW), y % IC], - name='data_im2col') + A = te.compute( + A_shape, + lambda n, x, y: data_pad[ + n, + HSTR * (x // OW) + dilation_h * ((y // IC) // KW), + WSTR * (x % OW) + dilation_w * ((y // IC) % KW), + y % IC, + ], + name="data_im2col", + ) # --- Pad if necessary N_transformed = B_interleaved_t.shape[0] @@ -146,65 +160,58 @@ def compute_conv2d_gemm_without_weight_transform(cfg, configure_knobs(cfg, M_padded, K_padded) # Pack A - A_interleaved = te.compute((batches, - M_padded // tile_rows_A, - K_padded // tile_cols_A, - tile_rows_A, - tile_cols_A), - lambda b, x, y, z, w: A[b, - z + tile_rows_A * x, - w + 
tile_cols_A * y], - name='A_interleaved') + A_interleaved = te.compute( + (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), + lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], + name="A_interleaved", + ) # Compute C - C_interleaved = te.compute((batches, - M_padded // tile_rows_A, - N_transformed, - tile_rows_A, - tile_rows_B), - lambda b, x, y, w, z: - te.sum(A_interleaved[b, - x, - k//tile_cols_A, - w, - idxm(k, tile_cols_A)].astype('int32')* - B_interleaved_t[y, - k//tile_cols_B, - z, - idxm(k, tile_cols_B)].astype('int32'), - axis=k), - name='C_interleaved') + C_interleaved = te.compute( + (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), + lambda b, x, y, w, z: te.sum( + A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") + * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), + axis=k, + ), + name="C_interleaved", + ) # Unpack C - C = te.compute((batches, M, N), - lambda b, x, y: - C_interleaved[b, - x // tile_rows_A, - y // tile_rows_B, - idxm(x, tile_rows_A), - idxm(y, tile_rows_B)].astype(out_dtype), - name="C") + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, x // tile_rows_A, y // tile_rows_B, idxm(x, tile_rows_A), idxm(y, tile_rows_B) + ].astype(out_dtype), + name="C", + ) zero = tvm.tir.const(0) else: # No need to pack/unpack - C = te.compute((batches, - M_padded, - N_padded), - lambda b, x, y: - te.sum(A[b, x, k].astype('int32')* - B_interleaved_t[y//tile_rows_B, - k//tile_cols_B, - idxm(y, tile_rows_B), - idxm(k, tile_cols_B)].astype('int32'), - axis=k), - name='C') - zero = tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] - \ - tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] + C = te.compute( + (batches, M_padded, N_padded), + lambda b, x, y: te.sum( + A[b, x, k].astype("int32") + * B_interleaved_t[ + y // tile_rows_B, k // tile_cols_B, idxm(y, tile_rows_B), idxm(k, tile_cols_B) + ].astype("int32"), + axis=k, + ), + name="C", + ) + zero = ( + tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + ) # --- Produce the conv output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), - name='conv2d_gemm_output') + out = te.compute( + out_shape, + lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name="conv2d_gemm_output", + ) return out + def schedule_conv2d_gemm(cfg, s, out, final_out): """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] @@ -233,7 +240,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis - outer_gemm, inner_gemm = cfg['reorder_gemm'].apply(s, C_interleaved, [xo, yo]) + outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) @@ -250,24 +257,31 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): _, M, N = C.shape if is_fast_int8_on_arm(): mmla = mmla_4x4_int8_int8_int32(in_type) - xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(xi, - yi, - x_factor=8, - y_factor=4) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( + xi, yi, x_factor=8, y_factor=4 + ) k_outer, k_inner = s[C_interleaved].split(k, 4) xi_inner_outer, xi_inner_inner = 
s[C_interleaved].split(xi_inner, 4) - s[C_interleaved].reorder(b_outer_gemm_fused, inner_gemm, xi_outer, - yi_outer, k_outer, xi_inner_outer, xi_inner_inner, - yi_inner, k_inner) + s[C_interleaved].reorder( + b_outer_gemm_fused, + inner_gemm, + xi_outer, + yi_outer, + k_outer, + xi_inner_outer, + xi_inner_inner, + yi_inner, + k_inner, + ) s[C_interleaved].tensorize(xi_inner_inner, mmla) s[C_interleaved].unroll(xi_inner_outer) elif is_aarch64_arm(): s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - assert in_type in ['int8', 'uint8'], "Only int8 and uint8 gemm are supported" - unroll = cfg['gemm_quantized_unroll'].val - interleave = cfg['gemm_quantized_interleave'].val + assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" + unroll = cfg["gemm_quantized_unroll"].val + interleave = cfg["gemm_quantized_interleave"].val gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) s[C_interleaved].pragma( b_outer_gemm_fused, @@ -276,7 +290,6 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): ) s[C_interleaved].tensorize(yi, gemm) - # Output transform if out != final_out: n, h, w, c = out.op.axis @@ -285,6 +298,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[out].vectorize(inner) return s + def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): """ Schedule the conv2d_gemm hybrid strategy """ C = out.op.input_tensors[0] @@ -293,7 +307,7 @@ def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): # Computation b, x, y = C.op.axis - k, = C.op.reduce_axis + (k,) = C.op.reduce_axis k_outer, k_inner = s[C].split(k, 16) x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 80ed67ca79a5..022e32704b3c 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,10 +24,14 @@ from .. 
import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform,\ - schedule_conv2d_gemm, schedule_conv2d_gemm_hybrid +from .conv2d_gemm import ( + compute_conv2d_gemm_without_weight_transform, + schedule_conv2d_gemm, + schedule_conv2d_gemm_hybrid, +) from .arm_utils import is_fast_int8_on_arm + def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): """ Get default int8 schedule config for the workload @@ -41,6 +45,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4 ) + def get_tiling_B(interleave_A): """ Compute the tiling information for matrix B in A*B=C """ if is_fast_int8_on_arm(): @@ -129,24 +134,43 @@ def traverse(op): return s -def _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, - dilation, out_dtype, interleave_A): +def _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, interleave_A +): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) - return compute_conv2d_gemm_without_weight_transform(cfg, - data, kernel, strides, padding, - dilation, out_dtype, (KH, KW), - OC, interleave_A) + return compute_conv2d_gemm_without_weight_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC, interleave_A + ) + -def _compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, - dilation, out_dtype, kernel_size=None, - output_channels=None, interleave_A=False): - return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, - dilation, out_dtype, kernel_size, - output_channels, interleave_A) +def _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size=None, + output_channels=None, + interleave_A=False, +): + return compute_conv2d_gemm_without_weight_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, + ) def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): @@ -178,42 +202,59 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + #### Interleaved schedules @autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Interface for interleaved compute_conv2d_NHWC_quantized""" - return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, - padding, dilation, out_dtype, True) + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, True + ) + @autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, dilation, - out_dtype, kernel_size, output_channels): +def compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" - return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, - strides, padding, dilation, - out_dtype, kernel_size, - output_channels, True) + return 
_compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True + ) + @autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") def schedule_conv2d_NHWC_quantized(cfg, outs): """ Interface for interleaved schedule_conv2d_NHWC_quantized""" return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + #### Hybrid schedules (A non interleaved, B is interleaved and transposed) @autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Interface for hybrid compute_conv2d_NHWC_quantized""" - return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, - padding, dilation, out_dtype, False) + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, False + ) + @autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid_without_transform(cfg, data, kernel, strides, padding, - dilation, out_dtype, - kernel_size, output_channels): +def compute_conv2d_NHWC_quantized_hybrid_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" - return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, - dilation, out_dtype, kernel_size, - output_channels, False) + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + False, + ) + @autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index ea55cd99e876..3d1c97139daa 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,19 +417,34 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") - C = te.compute((te.var("m"), te.var("n")), - lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype('int32') * - B[k // 16, y, idxm(k, 16)].astype('int32'), - axis=k), name="C") + C = te.compute( + (te.var("m"), te.var("n")), + lambda x, y: te.sum( + A[k // 16, x, idxm(k, 16)].astype("int32") * B[k // 16, y, idxm(k, 16)].astype("int32"), + axis=k, + ), + name="C", + ) - a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", - offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) + a_buffer = tvm.tir.decl_buffer( + A.shape, + dtype=in_type, + name="a_buffer", + offset_factor=1, + strides=[te.var("sa_1"), te.var("sa_2"), 1], + ) - b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", - offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) + b_buffer = tvm.tir.decl_buffer( + B.shape, + dtype=in_type, + name="b_buffer", + offset_factor=1, + strides=[te.var("sb_1"), te.var("sb_2"), 1], + ) - c_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="c_buffer", - offset_factor=1, strides=[te.var('sc'), 1]) + c_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(): @@ -585,13 +600,14 @@ def select_word(vec, lane, dtype_vec): Note that i can vary between 0 and 
3 """ # Reinterpret vec_a as 4 int32 words - vec_int32 = tvm.tir.call_intrin('int32x4', 'tir.reinterpret', vec) + vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) # Broadcast the lane-th word vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) # Convert back to uint8x16 - vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, 'tir.reinterpret', vec_int32_shuffled) + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled) return vec_int8_broadcast + def mmla_4x4_int8_int8_int32(dtype): """ Int8 4x4 matrix multiplication using sdot/udot instructions @@ -626,32 +642,33 @@ def mmla_4x4_int8_int8_int32(dtype): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((te.var("rows"), 4), dtype, name='data') - kernel = te.placeholder((4, 4), dtype, name='kernel') - dtype_vec = dtype + 'x16' - - k = te.reduce_axis((0, 4), name='k') - C = te.compute((te.var("rows"), 4), - lambda i, j: te.sum(data[i, k].astype('int32') * - kernel[j, k].astype('int32'), - axis=k), name="C") - - aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", - offset_factor=1, - strides=[te.var('sa'), 1]) - bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", - offset_factor=1, - strides=[te.var('sb'), 1]) - cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", - offset_factor=1, - strides=[te.var('sc'), 1]) + data = te.placeholder((te.var("rows"), 4), dtype, name="data") + kernel = te.placeholder((4, 4), dtype, name="kernel") + dtype_vec = dtype + "x16" + + k = te.reduce_axis((0, 4), name="k") + C = te.compute( + (te.var("rows"), 4), + lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, k].astype("int32"), axis=k), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + kernel.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: for i in range(0, 4): - ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x4'))) + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() vec_a = ins[0].vload([0, 0]) @@ -660,12 +677,15 @@ def _instr(index): # Execute the dot product for i in range(0, 4): - vec_c = outs[0].vload([i, 0], 'int32x4') + vec_c = outs[0].vload([i, 0], "int32x4") vdot = tvm.tir.call_llvm_intrin( - 'int32x4', - 'llvm.aarch64.neon.sdot', - tvm.tir.const(3, 'uint32'), - vec_c, vec_b, vec_aa[i]) + "int32x4", + "llvm.aarch64.neon.sdot", + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], + ) # Store the result ib.emit(outs[0].vstore([i, 0], vdot)) @@ -677,8 +697,12 @@ def _instr(index): buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( - C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, - default_buffer_params=buffer_params) + C.op, + _intrin_func, + binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + def mmla_16x4_int8_int8_int32(dtype, rows): """ @@ -714,32 +738,39 @@ def mmla_16x4_int8_int8_int32(dtype, rows): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((rows, 16), dtype, name='data') - 
kernel = te.placeholder((4, 16, 4), dtype, name='kernel') - dtype_vec = dtype + 'x16' + data = te.placeholder((rows, 16), dtype, name="data") + kernel = te.placeholder((4, 16, 4), dtype, name="kernel") + dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod - k = te.reduce_axis((0, 16), name='k') - C = te.compute((rows, 16), - lambda i, j: te.sum(data[i, k].astype('int32') * - kernel[k//4, j, idxm(k, 4)].astype('int32'), - axis=k), name="C") - - aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", - offset_factor=1, - strides=[te.var('sa'), 1]) - bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", - offset_factor=1, - strides=[te.var('sb0'), te.var('sb1'), 1]) - cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", - offset_factor=1, - strides=[te.var('sc'), 1]) + k = te.reduce_axis((0, 16), name="k") + C = te.compute( + (rows, 16), + lambda i, j: te.sum( + data[i, k].astype("int32") * kernel[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + ), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + kernel.shape, + dtype, + name="bb_buffer", + offset_factor=1, + strides=[te.var("sb0"), te.var("sb1"), 1], + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: for i in range(0, rows): - ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x16'))) + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) return ib.get() for k in range(0, rows): @@ -748,14 +779,17 @@ def _instr(index): for j in range(0, 4): for i in range(0, 4): vec_aa = select_word(vec_a, i, dtype_vec) - vec_b = ins[1].vload([i, 4*j, 0], dtype_vec) - vec_c = outs[0].vload([k, 4*j], 'int32x4') + vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( - 'int32x4', - 'llvm.aarch64.neon.sdot', - tvm.tir.const(3, 'uint32'), - vec_c, vec_b, vec_aa) - ib.emit(outs[0].vstore([k, 4*j], vdot)) + "int32x4", + "llvm.aarch64.neon.sdot", + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa, + ) + ib.emit(outs[0].vstore([k, 4 * j], vdot)) return ib.get() # body, reset, update @@ -763,8 +797,12 @@ def _instr(index): buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( - C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, - default_buffer_params=buffer_params) + C.op, + _intrin_func, + binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + def _q_multiply_shift_arm(op): """ From 7c8c8671ccf0a0efe9aaee019af743903f0ad728 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 18 Sep 2020 14:56:21 +0100 Subject: [PATCH 04/10] Fix python linting Change-Id: I5fb8a2ae4467a87bd3470f6b3753c074f9b7cc78 --- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index d669fa695e58..eeea3d785c7c 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -33,7 +33,7 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): - """ Return the new placeholder and the expression that represent + """Return the new placeholder and the expression that represent the 
matrix B transposed and interleaved""" assert ( From 8168589019d2893f49a335fd23497eee61a7387f Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 24 Sep 2020 21:59:41 +0100 Subject: [PATCH 05/10] Addressing review comments Change-Id: I284b1f2c121051e672f548d6c6ee2a3267854e31 --- python/tvm/relay/op/strategy/arm_cpu.py | 32 +++++----- python/tvm/topi/arm_cpu/arm_utils.py | 47 ++++++++++++++- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 52 +++++++++------- python/tvm/topi/arm_cpu/conv2d_gemm.py | 66 ++++++++++++-------- python/tvm/topi/arm_cpu/conv2d_int8.py | 70 ++++++++++------------ python/tvm/topi/arm_cpu/tensor_intrin.py | 20 ++++++- 6 files changed, 181 insertions(+), 106 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 9fe6e28e6f84..6759a54d0b80 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -136,21 +136,21 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): ) elif kernel_layout == "HWIO": is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() - has_dot_prod = topi.arm_cpu.arm_utils.is_fast_int8_on_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available() if has_dot_prod and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native.arm_cpu", ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved.arm_cpu", ) if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): - # TODO + # TODO(@giuseros) # This strategy errors out for quantized data types when tuning. 
# Let's use this only for non-aarch64 or non-quantized cases strategy.add_implementation( @@ -337,18 +337,18 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() - interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform - hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform + native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d_gemm(interleaved_compute), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(native_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native_without_transform.arm_cpu", ) strategy.add_implementation( - wrap_compute_conv2d_gemm(hybrid_compute), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(interleaved_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu", ) else: raise RuntimeError( diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index b9c7010d6ee3..c18312a7045c 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -20,7 +20,7 @@ import tvm -def is_fast_int8_on_arm(): +def is_dotprod_available(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) return "+v8.2a" in target.mattr and "+dotprod" in target.mattr @@ -30,3 +30,48 @@ def is_aarch64_arm(): """ Checks whether we are compiling for an AArch64 target. """ target = tvm.target.Target.current(allow_none=False) return "aarch64" in target.attrs.get("mtriple", "") + + +def get_tiling_B_interleaved_t(interleave_A): + """ Compute the tiling information for matrix B', where B' + is the transposed and interleaved version of matrix B in C=A*B. + + The tiling information is chosen to maximize register usage during the + tile computation. + + Please refer to: + - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + In order to have more information + + Parameters + ---------- + interleave_A: bool + determines if A is expected to be interleaved + + Returns + ---------- + tile_rows_B: the output tile rows of B' + tile_cols_B: the output tile columns of B' + """ + if is_dotprod_available(): + # The number of tile rows of B' vary depending on the + # strategy: + # * If we are interleaving A, then we select 12 columns from B'(i.e., + # 12 rows from B). + # * If we are not interleaving A, then we select 16 columns from B'(i.e., + # 16 rows from B). + tile_rows_B = 12 if interleave_A else 16 + + # Dot product instruction groups 2 (u)int16x8 vectors in + # groups of 4 and compute the dot product among those groups + # This means that the number of columns in a tile of B' (i.e., the + # rows of the original matrix B) need to be 4. + tile_cols_B = 4 + else: + # If dot product is not available, A must be interleaved. 
In this case + # we load 4 rows of B' (i.e., 4 columns of B). Each of them will contain 16 elements + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index eeea3d785c7c..f81d8f593390 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,15 +27,31 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config -from .arm_utils import is_fast_int8_on_arm +from .arm_utils import get_tiling_B_interleaved_t logger = logging.getLogger("topi") -def interleave_transpose_B(inputs, data, kernel, interleave_A): - """Return the new placeholder and the expression that represent - the matrix B transposed and interleaved""" - +def interleave_transpose_weights(inputs, data, kernel, interleave_A): + """ Transform the weight matrix by reshaping, interleaving and transposing it + + Parameters + ---------- + inputs : tvm.relay.Expr + Grouped input symbols + data : + Input shape and dtype + kernel : + Input shape and dtype + interleave_A: indicates if we expect matrix A to be interleaved + + Returns + ---------- + new_kernel : tvm.te.placeholder + A placeholder with the new shape + new_kernel_expr : tvm.relay.Expr + The relay expression of the weights + """ assert ( data.dtype == "int8" and kernel.dtype == "int8" @@ -47,12 +63,8 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): K = KH * KW * IC N = OC - if is_fast_int8_on_arm(): - tile_rows_B = 12 if interleave_A else 16 - tile_cols_B = 4 - else: - tile_rows_B = 4 - tile_cols_B = 16 + # Get tiling information for the interleaved transposed version of B + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) pad_K = 0 pad_N = 0 @@ -321,12 +333,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": + if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - N = OC - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B( + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( inputs, data, kernel, interleave_A=True ) new_workload = autotvm.task.args_to_workload( @@ -338,12 +349,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.contrib_conv2d_gemm_without_weight_transform( inputs[0], new_kernel_expr, **new_attrs ) - if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": + if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - N = OC - new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B( + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( inputs, data, kernel, interleave_A=False ) new_workload = autotvm.task.args_to_workload( diff 
--git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 09c53001963f..6896fd03289a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -29,7 +29,7 @@ mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32, ) -from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm +from .arm_utils import is_aarch64_arm, is_dotprod_available def configure_knobs(cfg, M, K): @@ -48,7 +48,7 @@ def configure_knobs(cfg, M, K): cfg["reorder_gemm"] = ReorderEntity([0, 1]) cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - if not is_fast_int8_on_arm(): + if not is_dotprod_available(): cfg.define_knob("gemm_quantized_unroll", [True, False]) cfg.define_knob("gemm_quantized_interleave", [True, False]) @@ -76,8 +76,7 @@ def compute_conv2d_gemm_without_weight_transform( KH, KW = get_const_tuple(kernel_size) OC = get_const_int(output_channels) - - K_AREA = KH * KW + kernel_area = KH * KW if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -101,13 +100,13 @@ def compute_conv2d_gemm_without_weight_transform( else: data_pad = data - # --- Im2col + # Im2col M = OH * OW - K = IC * K_AREA + K = IC * kernel_area N = OC A_shape = (batches, M, K) - if K_AREA == 1: + if kernel_area == 1: A = tvm.topi.reshape(data_pad, A_shape) else: A = te.compute( @@ -121,35 +120,48 @@ def compute_conv2d_gemm_without_weight_transform( name="data_im2col", ) - # --- Pad if necessary + # Pad if necessary N_transformed = B_interleaved_t.shape[0] tile_rows_B = B_interleaved_t.shape[2] tile_cols_B = B_interleaved_t.shape[3] - if is_fast_int8_on_arm() and interleave_A: + # Select the tiling strategy for A. + # The tiling information is chosen to maximize register usage during + # the tile computation. 
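The tile shapes chosen here decide how much the im2col output has to be padded further down. As a rough standalone sketch (the helper name is made up and not part of the patch; the arithmetic mirrors the pad_M/pad_K logic in this hunk):

def pad_to_tiles(M, K, tile_rows_A, tile_cols_A):
    # Round M and K up to the next multiple of the A tile shape, so that
    # tensorize always sees full (tile_rows_A x tile_cols_A) tiles.
    pad_M = (tile_rows_A - M % tile_rows_A) % tile_rows_A
    pad_K = (tile_cols_A - K % tile_cols_A) % tile_cols_A
    return M + pad_M, K + pad_K

# The dot-product interleaved strategy tiles A as 8x4, the native one as 4x16.
# For example, M=196, K=147 is padded to (200, 148) in the 8x4 case.
print(pad_to_tiles(196, 147, 8, 4))  # (200, 148)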
+ # + # Please refer to: + # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + # In order to have more information + # + if is_dotprod_available() and interleave_A: + # If dot product has been enabled, and we are interleaving A + # tile size should be 8x4 tile_rows_A = 8 tile_cols_A = 4 else: + # If either there is no dot product or if we are using a native strategy + # tile size should be 4x16 tile_rows_A = 4 tile_cols_A = 16 - pad_m = 0 - pad_k = 0 + pad_M = 0 + pad_K = 0 if M % tile_rows_A != 0: - pad_m = tile_rows_A - (M % tile_rows_A) + pad_M = tile_rows_A - (M % tile_rows_A) if K % tile_cols_A != 0: - pad_k = tile_cols_A - (K % tile_cols_A) + pad_K = tile_cols_A - (K % tile_cols_A) - M_padded = M + pad_m - K_padded = K + pad_k + M_padded = M + pad_M + K_padded = K + pad_K N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) - pad_after = (0, pad_m, pad_k) + pad_after = (0, pad_M, pad_K) - if pad_m != 0 or pad_k != 0: + if pad_M != 0 or pad_K != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") idxm = tvm.tir.indexmod @@ -159,13 +171,13 @@ def compute_conv2d_gemm_without_weight_transform( # Configuration space configure_knobs(cfg, M_padded, K_padded) - # Pack A + # Pack the input data A_interleaved = te.compute( (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], name="A_interleaved", ) - # Compute C + # Execute GEMM C_interleaved = te.compute( (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), lambda b, x, y, w, z: te.sum( @@ -175,7 +187,7 @@ def compute_conv2d_gemm_without_weight_transform( ), name="C_interleaved", ) - # Unpack C + # Unpack the result C = te.compute( (batches, M, N), lambda b, x, y: C_interleaved[ @@ -185,7 +197,7 @@ def compute_conv2d_gemm_without_weight_transform( ) zero = tvm.tir.const(0) else: - # No need to pack/unpack + # No need to pack/unpack, execute GEMM directly C = te.compute( (batches, M_padded, N_padded), lambda b, x, y: te.sum( @@ -197,12 +209,16 @@ def compute_conv2d_gemm_without_weight_transform( ), name="C", ) + + # We need to ensure that infer bound pass does not remove the padding + # which is necessary for the tensorizations to work. 
So we need to + # add a dummy reference to the padding area of the result zero = ( tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] ) - # --- Produce the conv output + # Reshape the result into a convolution output out_shape = (batches, OH, OW, OC) out = te.compute( out_shape, @@ -212,7 +228,7 @@ def compute_conv2d_gemm_without_weight_transform( return out -def schedule_conv2d_gemm(cfg, s, out, final_out): +def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] @@ -255,7 +271,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): k = C_interleaved.op.reduce_axis[0] _, M, N = C.shape - if is_fast_int8_on_arm(): + if is_dotprod_available(): mmla = mmla_4x4_int8_int8_int32(in_type) xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( xi, yi, x_factor=8, y_factor=4 @@ -299,7 +315,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): return s -def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): +def schedule_conv2d_gemm_native(cfg, s, out, final_out): """ Schedule the conv2d_gemm hybrid strategy """ C = out.op.input_tensors[0] A = C.op.input_tensors[0] diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 022e32704b3c..43fe80178bd3 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -26,10 +26,10 @@ from .tensor_intrin import dot_int8_int8_int32 from .conv2d_gemm import ( compute_conv2d_gemm_without_weight_transform, - schedule_conv2d_gemm, - schedule_conv2d_gemm_hybrid, + schedule_conv2d_gemm_interleaved, + schedule_conv2d_gemm_native, ) -from .arm_utils import is_fast_int8_on_arm +from .arm_utils import get_tiling_B_interleaved_t def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @@ -46,20 +46,6 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): ) -def get_tiling_B(interleave_A): - """ Compute the tiling information for matrix B in A*B=C """ - if is_fast_int8_on_arm(): - # Two options in case dotprod is enabled - tile_cols_B = 4 - tile_rows_B = 12 if interleave_A else 16 - else: - # Single option if dotprod is not available - tile_rows_B = 4 - tile_cols_B = 16 - - return tile_rows_B, tile_cols_B - - @autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): """Compute conv2d int8 with NCHWc layout""" @@ -139,7 +125,7 @@ def _compute_conv2d_NHWC_quantized( ): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) return compute_conv2d_gemm_without_weight_transform( @@ -189,9 +175,9 @@ def _callback(op): if op.name == "conv2d_gemm_output": conv_out = op.output(0) if interleave_A: - schedule_conv2d_gemm(cfg, s, conv_out, out) + schedule_conv2d_gemm_interleaved(cfg, s, conv_out, out) else: - schedule_conv2d_gemm_hybrid(cfg, s, conv_out, out) + schedule_conv2d_gemm_native(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: @@ -203,45 +189,49 @@ def _callback(op): return s -#### Interleaved schedules -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def 
compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): - """ Interface for interleaved compute_conv2d_NHWC_quantized""" +# Interleaved schedules: those schedule will interleave the input data. The +# weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved""" return _compute_conv2d_NHWC_quantized( cfg, data, kernel, strides, padding, dilation, out_dtype, True ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels ): - """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved_without_transform""" return _compute_conv2d_NHWC_quantized_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): - """ Interface for interleaved schedule_conv2d_NHWC_quantized""" +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved.arm_cpu") +def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved""" return _schedule_conv2d_NHWC_quantized(cfg, outs, True) -#### Hybrid schedules (A non interleaved, B is interleaved and transposed) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): - """ Interface for hybrid compute_conv2d_NHWC_quantized""" +# Native schedules: those schedule won't interleave A (which is left in its native form). 
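As a rough usage sketch (not taken from the patch; the relay graph below is illustrative), an int8 NHWC convolution compiled for a dot-product capable AArch64 target is what ends up selecting these interleaved/native strategies. The target string mirrors the one used in the tests:

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 56, 56, 64), dtype="int8")
weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
conv = relay.nn.conv2d(
    data,
    weight,
    kernel_size=(3, 3),
    channels=64,
    padding=(1, 1),
    data_layout="NHWC",
    kernel_layout="HWIO",
    out_dtype="int32",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

# With +dotprod both the native and the interleaved strategies are available;
# without it only the interleaved (and spatial pack) ones are registered.
target = "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)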
+# The weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu") +def compute_conv2d_NHWC_quantized_native(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for native compute_conv2d_NHWC_quantized""" return _compute_conv2d_NHWC_quantized( cfg, data, kernel, strides, padding, dilation, out_dtype, False ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid_without_transform( +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_native_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels ): - """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" + """ Interface for compute_conv2d_NHWC_quantized_native_without_transform""" return _compute_conv2d_NHWC_quantized_without_transform( cfg, data, @@ -256,7 +246,7 @@ def compute_conv2d_NHWC_quantized_hybrid_without_transform( ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") -def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): - """ Interface for hybrid schedule_conv2d_NHWC_quantized""" +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu") +def schedule_conv2d_NHWC_quantized_native(cfg, outs): + """ Interface for native schedule_conv2d_NHWC_quantized""" return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 3d1c97139daa..eaf9c209d6dd 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -594,10 +594,24 @@ def select_word(vec, lane, dtype_vec): The pseudo-code for this operation is: v = [x0, ..., x15] - vsub(i) = v[i:i+3] - replicated_v(i) = [vsub(i), vsub(i), vsub(i), vsub(i)] + vsub(lane) = v[4*lane:4*lane+3] + replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)] - Note that i can vary between 0 and 3 + Note that 0<=lane<4 + + Parameters + ---------- + vec: tvm.tir.Expr + int8x16 vector expression + lane: int + vector lane we want to replicate + dtype_vec: str + vector data type (e.g., int8x16) + + Returns + ---------- + output: tvm.tir.Expr + replicated vector """ # Reinterpret vec_a as 4 int32 words vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) From 01d827a0b0a7ac894fd5c7e7a32adaa9751dc89b Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 25 Sep 2020 12:15:26 +0100 Subject: [PATCH 06/10] Fix black linting issues Change-Id: I1813b0226b536aedee0dce9eeeba27aa2d95518b --- python/tvm/topi/arm_cpu/arm_utils.py | 2 +- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index c18312a7045c..f5c94266bec6 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -33,7 +33,7 @@ def is_aarch64_arm(): def get_tiling_B_interleaved_t(interleave_A): - """ Compute the tiling information for matrix B', where B' + """Compute the tiling information for matrix B', where B' is the transposed and interleaved version of matrix B in C=A*B. 
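As a side reference for the B' layout being described, a NumPy sketch under my reading of the shapes used in conv2d_gemm.py (the function name is made up, not part of the patch):

import numpy as np

def interleave_transpose_ref(B, tile_rows_B, tile_cols_B):
    # B has shape (K, N); B' = B^T is padded and split into
    # (tile_rows_B, tile_cols_B) blocks, matching B_interleaved_t's
    # (N // tile_rows_B, K // tile_cols_B, tile_rows_B, tile_cols_B) shape.
    K, N = B.shape
    pad_N = (tile_rows_B - N % tile_rows_B) % tile_rows_B
    pad_K = (tile_cols_B - K % tile_cols_B) % tile_cols_B
    B_t = np.pad(B.T, ((0, pad_N), (0, pad_K)))
    N_pad, K_pad = B_t.shape
    B_tiled = B_t.reshape(N_pad // tile_rows_B, tile_rows_B, K_pad // tile_cols_B, tile_cols_B)
    return B_tiled.transpose(0, 2, 1, 3)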
The tiling information is chosen to maximize register usage during the diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index f81d8f593390..a64bc413e0c6 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -33,7 +33,7 @@ def interleave_transpose_weights(inputs, data, kernel, interleave_A): - """ Transform the weight matrix by reshaping, interleaving and transposing it + """Transform the weight matrix by reshaping, interleaving and transposing it Parameters ---------- From 82ce6474c645a92f60da48701a12e5219437e9b5 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 25 Sep 2020 16:24:18 +0100 Subject: [PATCH 07/10] Fixing failing test and adding tests for dot-product compilation Change-Id: Ic040722abd5538fccb85af4de922394c939e7000 --- python/tvm/topi/arm_cpu/tensor_intrin.py | 15 ++- .../python/relay/test_pass_alter_op_layout.py | 2 +- .../topi/python/test_topi_conv2d_int8.py | 105 +++++++++++------- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index eaf9c209d6dd..79b9bf8fd732 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -677,6 +677,8 @@ def mmla_4x4_int8_int8_int32(dtype): C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() @@ -685,7 +687,7 @@ def _instr(index): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() - vec_a = ins[0].vload([0, 0]) + vec_a = ins[0].vload([0, 0], dtype_vec) vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] vec_b = ins[1].vload([0, 0], dtype_vec) @@ -693,12 +695,7 @@ def _instr(index): for i in range(0, 4): vec_c = outs[0].vload([i, 0], "int32x4") vdot = tvm.tir.call_llvm_intrin( - "int32x4", - "llvm.aarch64.neon.sdot", - tvm.tir.const(3, "uint32"), - vec_c, - vec_b, - vec_aa[i], + "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i], ) # Store the result @@ -779,6 +776,8 @@ def mmla_16x4_int8_int8_int32(dtype, rows): C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() @@ -797,7 +796,7 @@ def _instr(index): vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( "int32x4", - "llvm.aarch64.neon.sdot", + llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 7b242c479bf1..ff6997e2f991 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1016,7 +1016,7 @@ def _query_inside(self, target, workload): def update(self, target, workload, cfg): key = (str(target), workload) assert workload[2][1] == expected_workload_shape - assert workload[0] == "conv2d_NHWC_quantized_without_transform.arm_cpu" + assert workload[0] == "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" self.memory[key] = cfg def alter_conv2d(attrs, inputs, tinfos, out_type): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py 
b/tests/python/topi/python/test_topi_conv2d_int8.py index 238517e5ed75..b2f9835c3d66 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -56,48 +56,67 @@ def compile_conv2d_NHWC_gemm_int8_arm( W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter,), name="bias", dtype="int8") dtype = "int32" - device = "llvm --device arm_cpu --mtriple aarch64-linux-gnu" - - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): - assert is_aarch64_arm(), "AArch64 target not recognized" - - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) + devices = [ + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + ] + + for device_tuple in devices: + device = device_tuple[0] + compute = device_tuple[1] + schedule = device_tuple[2] + + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) - - if add_bias: - tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: - func = tvm.build( - s, - [A, W, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) def verify_conv2d_NHWC_gemm_int8( @@ -155,14 +174,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( + C = 
topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) + s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) From 16b9c6e2eb0697dacd004c3a450e152b7f4234aa Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 28 Sep 2020 15:14:10 +0100 Subject: [PATCH 08/10] Fixing linting and review comments Change-Id: If09e3baa514c85dc78d3c27c2ac2fa2e01773d89 --- python/tvm/topi/arm_cpu/arm_utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index f5c94266bec6..fb47b29d6f6a 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -17,13 +17,36 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Arm target utility functions""" +import re import tvm +def get_arch_version(target_mattr): + """ Parse the LLVM target -mattr, and return + the architecture version in a decimal representation + (e.g., if -mattr=v8.4a, return 8.4) + """ + + arch_version = 8.0 + m = re.compile(r"\+v(.*)\.(.*)a") + for attr in target_mattr: + match_obj = m.match(attr) + if match_obj: + major = int(match_obj.group(1)) + minor = int(match_obj.group(2)) + decimal = 10 + if minor >= 10: + decimal = 100 + arch_version = major + float(minor) / decimal + + return arch_version + + def is_dotprod_available(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) - return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + arch_version = get_arch_version(target.mattr) + return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr) def is_aarch64_arm(): From 8fd794a66a3bea14f67e137c5d8e1d421502cda3 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 29 Sep 2020 14:50:05 +0100 Subject: [PATCH 09/10] Fixing black linting and address comments Change-Id: I857b28b6f9b23307d8c1eebc509de6ad2783c756 --- python/tvm/topi/arm_cpu/arm_utils.py | 2 +- python/tvm/topi/arm_cpu/conv2d_gemm.py | 12 +-- python/tvm/topi/arm_cpu/tensor_intrin.py | 111 +++++++++++++++++------ 3 files changed, 88 insertions(+), 37 deletions(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index fb47b29d6f6a..7e0f566b96f4 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -22,7 +22,7 @@ def get_arch_version(target_mattr): - """ Parse the LLVM target -mattr, and return + """Parse the LLVM target -mattr, and return the architecture version in a decimal representation (e.g., if -mattr=v8.4a, return 8.4) """ diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 6896fd03289a..b40fb89b5d33 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -26,8 +26,8 @@ from .tensor_intrin import ( gemm_quantized, gemm_quantized_impl, - mmla_4x4_int8_int8_int32, - mmla_16x4_int8_int8_int32, + gemm_acc_4x4_int8_int8_int32, + gemm_acc_nx16_int8_int8_int32, ) from .arm_utils import is_aarch64_arm, is_dotprod_available @@ -272,7 +272,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): k = C_interleaved.op.reduce_axis[0] _, M, N = C.shape if 
is_dotprod_available(): - mmla = mmla_4x4_int8_int8_int32(in_type) + gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( xi, yi, x_factor=8, y_factor=4 ) @@ -289,7 +289,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): yi_inner, k_inner, ) - s[C_interleaved].tensorize(xi_inner_inner, mmla) + s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) s[C_interleaved].unroll(xi_inner_outer) elif is_aarch64_arm(): @@ -327,9 +327,9 @@ def schedule_conv2d_gemm_native(cfg, s, out, final_out): k_outer, k_inner = s[C].split(k, 16) x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) - mmla = mmla_16x4_int8_int8_int32(in_type, rows=1) + gemm_acc = gemm_acc_nx16_int8_int8_int32(in_type, rows=1) s[C].unroll(x_inner) - s[C].tensorize(y_inner, mmla) + s[C].tensorize(y_inner, gemm_acc) s[C].parallel(x_outer) # Input transform diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 79b9bf8fd732..9ed4c591da0f 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -622,28 +622,28 @@ def select_word(vec, lane, dtype_vec): return vec_int8_broadcast -def mmla_4x4_int8_int8_int32(dtype): +def gemm_acc_4x4_int8_int8_int32(dtype): """ - Int8 4x4 matrix multiplication using sdot/udot instructions - This function takes two arrays of int8 datatype -- A[4][4] and - B[4][4] and produces a 4x4 matrix which is equal to A*B + Int8 4x4 matrix multiplication and accumulation using sdot/udot + instructions. This function takes two arrays of int8 datatype + -- A[4][4] and B[4][4] and produces a 4x4 matrix + which is equal to A*B. + The pseudo code is as follows. .. code-block:: c - void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 output[4][4]){ + void gemm_acc_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 C[4][4]){ for (int i = 0; i < 4; i++){ for (int j = 0; i < 4; i++){ - out[i][j] = 0; for (int k = 0; k < 4; k++){ - out[i][j] += A[i][k] * B[j][k] + C[i][j] += A[i][k] * B[j][k] } } } Notes: * The rows of matrix B are transposed - * Matrix A is interleaved This function returns a TensorIntrin that can be used to tensorize a schedule. 
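A compact NumPy reference of what this intrinsic accumulates (a sketch, not part of the patch; note the accumulation into C and the transposed B operand):

import numpy as np

def gemm_acc_4x4_ref(A, B, C):
    # C[i, j] += sum_k A[i, k] * B[j, k], with int32 accumulation.
    C += A.astype(np.int32) @ B.astype(np.int32).T
    return C

A = np.arange(16, dtype=np.int8).reshape(4, 4)
B = np.ones((4, 4), dtype=np.int8)
C = np.zeros((4, 4), dtype=np.int32)
print(gemm_acc_4x4_ref(A, B, C))  # row i of C holds four copies of sum(A[i, :])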
Parameters @@ -656,22 +656,25 @@ def mmla_4x4_int8_int8_int32(dtype): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((te.var("rows"), 4), dtype, name="data") - kernel = te.placeholder((4, 4), dtype, name="kernel") + # This needs to be a variable number of "rows" since TVM + # "thinks" I only need to compute one row because of + # padding + A = te.placeholder((te.var("rows"), 4), dtype, name="data") + B = te.placeholder((4, 4), dtype, name="kernel") dtype_vec = dtype + "x16" k = te.reduce_axis((0, 4), name="k") C = te.compute( (te.var("rows"), 4), - lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, k].astype("int32"), axis=k), + lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), name="C", ) aa_buffer = tvm.tir.decl_buffer( - data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] ) bb_buffer = tvm.tir.decl_buffer( - kernel.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] ) cc_buffer = tvm.tir.decl_buffer( C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] @@ -686,16 +689,49 @@ def _instr(index): for i in range(0, 4): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() - + # Load all the elements of tile A. + # vec_a = [a, b, c, d, + # e, f, g, h, + # i, l, m, n, + # o, p, q, r,]; vec_a = ins[0].vload([0, 0], dtype_vec) + + # Replicate 4 times the i-th row of A. For instance, + # vec_a[0] = [a, b, c, d, + # a, b, c, d, + # a, b, c, d, + # a, b, c, d,]; vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + + # Load all the elements of B. Remember that B + # is transposed: + # vec_b = [0, 4, 8, 12, + # 1, 5, 9, 13, + # 2, 6, 10, 14, + # 3, 7, 11, 15,]; vec_b = ins[1].vload([0, 0], dtype_vec) # Execute the dot product for i in range(0, 4): vec_c = outs[0].vload([i, 0], "int32x4") + # Compute the product between the i-th row of A + # and all the rows of B. Remember that sdot/udot + # subdive the input vectors in 16 elements + # and then take the dot product among each group. + # The result is stored in a int32x4 register + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*4+c*8+d*12, + # a*1+b*5+c*9+d*13, + # a*2+b*6+c*10+d*14, + # a*3+b*7+c*11+d*15] vdot = tvm.tir.call_llvm_intrin( - "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i], + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], ) # Store the result @@ -710,33 +746,36 @@ def _instr(index): return te.decl_tensor_intrin( C.op, _intrin_func, - binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, default_buffer_params=buffer_params, ) -def mmla_16x4_int8_int8_int32(dtype, rows): +def gemm_acc_nx16_int8_int8_int32(dtype, rows): """ - Int8 16x4 matrix multiplication using sdot/udot instructions + Int8 16x4 matrix multiplication and accumulation using sdot/udot instructions This function takes two arrays of int8 datatype -- A[rows][4] and B[4][16] and produces a rowsx16 matrix which is equal to A*B The pseudo code is as follows. .. 
code-block:: c - void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 output[rows][16]){ + void mmla_16x4_int8_int8_int32(int8 A[rows][16], int8 B[4][16][4], int32 output[rows][16]){ for (int i = 0; i < rows; i++){ for (int j = 0; i < 16; i++){ - out[i][j] = 0; - for (int k = 0; k < 4; k++){ - out[i][j] += A[i][k] * B[j][k] + for (int k = 0; k < 16; k++){ + out[i][j] += A[i][k] * B[k//4][j][k%4] } + } } } Notes: * The rows of matrix B are transposed - * A is not interleaved, but used in its native form + * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 + we need 4 tiles of B to compute a single row of the output. The first 4 values of + k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on + This function returns a TensorIntrin that can be used to tensorize a schedule. Parameters @@ -749,24 +788,24 @@ def mmla_16x4_int8_int8_int32(dtype, rows): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((rows, 16), dtype, name="data") - kernel = te.placeholder((4, 16, 4), dtype, name="kernel") + A = te.placeholder((rows, 16), dtype, name="data") + B = te.placeholder((4, 16, 4), dtype, name="kernel") dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod k = te.reduce_axis((0, 16), name="k") C = te.compute( (rows, 16), lambda i, j: te.sum( - data[i, k].astype("int32") * kernel[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + A[i, k].astype("int32") * B[k // 4, j, idxm(k, 4)].astype("int32"), axis=k ), name="C", ) aa_buffer = tvm.tir.decl_buffer( - data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] ) bb_buffer = tvm.tir.decl_buffer( - kernel.shape, + B.shape, dtype, name="bb_buffer", offset_factor=1, @@ -785,14 +824,26 @@ def _instr(index): for i in range(0, rows): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) return ib.get() - + # Iterate on the number of rows of the output for k in range(0, rows): + # Load 16 elements of A + # vec_a = [a, b, c, e, f, g, h, i, l, m, n, o, p, q, r,]; vec_a = ins[0].vload([k, 0], dtype_vec) + # Iterate over each column of the output for j in range(0, 4): + # Accumulate over each of the 4 (16x4) tiles contained in B for i in range(0, 4): + # As before, replicate a single 4-element group of A vec_aa = select_word(vec_a, i, dtype_vec) + # Load 4 rows (each rows with 4 elements) from B + # vec_b = [0, 16, 32, 48, + # 1, 17, 33, 49, + # 2, 18, 34, 50, + # 3, 19, 35, 51,]; vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + # Store the result of the accumulation in the + # correct part of the output vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( "int32x4", @@ -812,7 +863,7 @@ def _instr(index): return te.decl_tensor_intrin( C.op, _intrin_func, - binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, default_buffer_params=buffer_params, ) From 0f8116a191f47b1942601944760b4a5e8b51e748 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 2 Oct 2020 18:27:07 +0100 Subject: [PATCH 10/10] Address review comments Change-Id: I63d1a639d4a72abeb33148fd2868cd356ef84122 --- python/tvm/topi/arm_cpu/tensor_intrin.py | 49 +++++++++++++++--------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 9ed4c591da0f..73cfacb62079 100644 --- 
a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -644,7 +644,7 @@ def gemm_acc_4x4_int8_int8_int32(dtype): Notes: * The rows of matrix B are transposed - This function returns a TensorIntrin that can be used to tensorize a schedule. + * The tiling strategy is picked to maximize register usage. Parameters ---------- @@ -659,8 +659,8 @@ def gemm_acc_4x4_int8_int8_int32(dtype): # This needs to be a variable number of "rows" since TVM # "thinks" I only need to compute one row because of # padding - A = te.placeholder((te.var("rows"), 4), dtype, name="data") - B = te.placeholder((4, 4), dtype, name="kernel") + A = te.placeholder((te.var("rows"), 4), dtype, name="A") + B = te.placeholder((4, 4), dtype, name="B") dtype_vec = dtype + "x16" k = te.reduce_axis((0, 4), name="k") @@ -692,8 +692,8 @@ def _instr(index): # Load all the elements of tile A. # vec_a = [a, b, c, d, # e, f, g, h, - # i, l, m, n, - # o, p, q, r,]; + # l, m, n, o, + # p, q, r, s]; vec_a = ins[0].vload([0, 0], dtype_vec) # Replicate 4 times the i-th row of A. For instance, @@ -753,15 +753,15 @@ def _instr(index): def gemm_acc_nx16_int8_int8_int32(dtype, rows): """ - Int8 16x4 matrix multiplication and accumulation using sdot/udot instructions - This function takes two arrays of int8 datatype -- A[rows][4] and + Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[n][4] and B[4][16] and produces a rowsx16 matrix which is equal to A*B The pseudo code is as follows. .. code-block:: c - void mmla_16x4_int8_int8_int32(int8 A[rows][16], int8 B[4][16][4], int32 output[rows][16]){ - for (int i = 0; i < rows; i++){ + void mmla_nx16_int8_int8_int32(int8 A[n][16], int8 B[4][16][4], int32 output[n][16]){ + for (int i = 0; i < n; i++){ for (int j = 0; i < 16; i++){ for (int k = 0; k < 16; k++){ out[i][j] += A[i][k] * B[k//4][j][k%4] @@ -775,21 +775,22 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 we need 4 tiles of B to compute a single row of the output. The first 4 values of k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on - - This function returns a TensorIntrin that can be used to tensorize a schedule. + * The tiling strategy is picked to maximize register usage. 
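The same kind of NumPy reference for the nx16 variant (a sketch, not part of the patch), spelling out the B tile layout used in the pseudo code above:

import numpy as np

def gemm_acc_nx16_ref(A, B_tiles, C):
    # A: (n, 16) int8, B_tiles: (4, 16, 4) int8, C: (n, 16) int32 accumulator.
    # out[i][j] += A[i][k] * B[k // 4][j][k % 4] for k in 0..15, in int32.
    n = A.shape[0]
    A32, B32 = A.astype(np.int32), B_tiles.astype(np.int32)
    for i in range(n):
        for j in range(16):
            for k in range(16):
                C[i, j] += A32[i, k] * B32[k // 4, j, k % 4]
    return C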
Parameters ---------- dtype: str, {"uint8", "int8"} Whether it works on unsigned int or signed int + rows: int + Number of of the output rows "n" Returns ------- intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - A = te.placeholder((rows, 16), dtype, name="data") - B = te.placeholder((4, 16, 4), dtype, name="kernel") + A = te.placeholder((rows, 16), dtype, name="A") + B = te.placeholder((4, 16, 4), dtype, name="B") dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod k = te.reduce_axis((0, 16), name="k") @@ -827,24 +828,34 @@ def _instr(index): # Iterate on the number of rows of the output for k in range(0, rows): # Load 16 elements of A - # vec_a = [a, b, c, e, f, g, h, i, l, m, n, o, p, q, r,]; + # vec_a = [a, b, c, d, e, f, g, h, l, m, n, o, p, q, r, s]; vec_a = ins[0].vload([k, 0], dtype_vec) - # Iterate over each column of the output + # Iterate over each of the 4 rowsx4 tiles of the output for j in range(0, 4): # Accumulate over each of the 4 (16x4) tiles contained in B for i in range(0, 4): - # As before, replicate a single 4-element group of A + # Replicate a single 4-element group of A (A[k, i:i+4]) vec_aa = select_word(vec_a, i, dtype_vec) - # Load 4 rows (each rows with 4 elements) from B + + # Load 4 rows (each rows with 4 elements) from B (B[i:i+4, j:j+4]) # vec_b = [0, 16, 32, 48, # 1, 17, 33, 49, # 2, 18, 34, 50, # 3, 19, 35, 51,]; vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) - # Store the result of the accumulation in the - # correct part of the output + + # Accumulate in the correct part of the output vec_c = outs[0].vload([k, 4 * j], "int32x4") + + # Compute the dot product between the rowsx4 tile + # from A and the 4x4 tile from B + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*16+c*32+d*48, + # a*1+b*17+c*33+d*49, + # a*2+b*18+c*34+d*50, + # a*3+b*19+c*35+d*51] vdot = tvm.tir.call_llvm_intrin( "int32x4", llvm_intrin,