From 5f2c4ea26721ef53f545172b60932bf7224058bb Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Mon, 22 Nov 2021 21:32:19 +0000 Subject: [PATCH] [microNPU] Fix incorrectly calculated stride when converting NHWC to NHCWB16 Fixes an issue that causes strides to be incorrectly calculated when the number of channels in the input is less than 16 and involves a conversion from NHWC to NHCWB16. This is due to TVM being 'too smart' when analyzing generated TE and removing compute that is deemed unnecessary. Consequently, strides over data are incorrectly calculated leading to an output mismatch. The PR uses a reduce sum operation to trick TE's data dependency analyzer into looping over a whole block (16), rather than the number of channels actually used (< 16). This causes the calculated strides to be a multiple of 16 which is required for NHCWB16 format. Change-Id: Ibf76a94a12cebf51fa716fcac1de932a271c4a6d --- .../relay/backend/contrib/ethosu/te/dma.py | 11 +- .../relay/backend/contrib/ethosu/tir/dma.py | 12 +- .../test_ethosu/test_replace_pooling.py | 183 ++++++++++++++---- 3 files changed, 160 insertions(+), 46 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/te/dma.py b/python/tvm/relay/backend/contrib/ethosu/te/dma.py index f80c7d963088..5d51c7bfae20 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/dma.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/dma.py @@ -146,6 +146,12 @@ def write_compute( def convert_to_nhwc_compute(tensor: te.Tensor, layout: str, channels: int) -> te.Tensor: """Converts a tensor into NHWC layout if it's in NHWCB16 layout. + When the current layout is NHCWB16, a reduce sum operation is inserted + to ensure that the whole of the input tensor has a data dependency on + the copy operation. Without this, TVM removes compute that is deemed to + be unnecessary, which causes strides for the NPU to be calculated + incorrectly. 
+ Parameters ---------- tensor : te.Tensor @@ -167,9 +173,12 @@ def convert_to_nhwc_compute(tensor: te.Tensor, layout: str, channels: int) -> te "layout": layout, } if layout == "NHCWB16": + rc = te.reduce_axis((0, 16), name="rc") return te.compute( (tensor.shape[0], tensor.shape[1], tensor.shape[3], channels), - lambda nn, hh, ww, cc: tensor(nn, hh, te.indexdiv(cc, 16), ww, te.indexmod(cc, 16)), + lambda nn, hh, ww, cc: te.sum( + tensor(nn, hh, te.indexdiv(cc, 16), ww, te.indexmod(rc, 16)), axis=rc + ), name="ethosu_convert_to_nhwc", attrs=convert_to_nhwc_attrs, ) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/dma.py b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py index a116e51c5b7c..46df20814eb5 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/dma.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py @@ -94,10 +94,18 @@ def get_convert_to_nhwc_params(stmt): The pointer produced by the operation. """ - _, body = get_op_attrs(stmt) + attrs, body = get_op_attrs(stmt) _, _, _, c, _, inner = get_outer_loops(body, "NHWC") + + # Ignore the reduce sum operation inserted to ensure + # compute that is deemed unnecessary isn't removed by TVM. 
+ if attrs["layout"] == "NHCWB16": + inner = inner.body + input_pointer = inner.value.b.buffer_var + else: + input_pointer = inner.value.buffer_var + output_pointer = inner.buffer_var - input_pointer = inner.value.buffer_var return c.extent, input_pointer, output_pointer diff --git a/tests/python/contrib/test_ethosu/test_replace_pooling.py b/tests/python/contrib/test_ethosu/test_replace_pooling.py index 79526ed527e8..ee72ffa4cb99 100644 --- a/tests/python/contrib/test_ethosu/test_replace_pooling.py +++ b/tests/python/contrib/test_ethosu/test_replace_pooling.py @@ -26,52 +26,18 @@ from .infra import make_ethosu_pooling, get_pooling_args -@pytest.mark.parametrize( - "ifm_shape, ofm_channels, ifm_layout, ofm_layout, rounding_mode", - [ - ((1, 5, 9, 3), 3, "NHWC", "NHWC", "TFL"), - ((1, 8, 3, 9, 16), 40, "NHCWB16", "NHCWB16", "NATURAL"), - ((1, 8, 3, 9, 16), 40, "NHCWB16", "NHWC", "TRUNCATE"), - ((1, 8, 9, 40), 40, "NHWC", "NHCWB16", "TFL"), - ], -) -@pytest.mark.parametrize("pooling_type", ["AVG", "MAX"]) -@pytest.mark.parametrize("activation", ["NONE", "CLIP"]) -def test_pooling_single( +def _create_serial_pooling( ifm_shape, ofm_channels, ifm_layout, ofm_layout, + pool_shape, pooling_type, - activation, - rounding_mode, + strides, + padding, + activation="NONE", + rounding_mode="TFL", ): - pool_shape = (3, 2) - strides = (1, 2) - padding = (1, 1, 1, 0) - ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") - pooling = make_ethosu_pooling( - ifm, - pooling_type, - pool_shape, - ofm_channels, - strides, - padding, - activation, - ifm_layout, - ofm_layout, - rounding_mode, - ) - func = relay.Function(relay.analysis.free_vars(pooling), pooling) - func = run_opt_pass(func, relay.transform.InferType()) - mod, _ = lower_to_tir(func) - data = [] - - def _visit(stmt): - if isinstance(stmt, tvm.tir.Call): - data.append(get_pooling_args(stmt)) - - tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _visit) if ifm_layout == "NHWC": ifm_stride_c = 1 ifm_stride_w = 
ifm_shape[3] @@ -80,7 +46,7 @@ def _visit(stmt): ofm_width = (ifm_shape[2] - pool_shape[1] + padding[1] + padding[1]) // strides[1] + 1 else: ifm_stride_w = 16 - ifm_stride_c = 16 * ifm_shape[3] + ifm_stride_c = 16 * ifm_shape[3] if ofm_channels >= 16 else 1 ifm_stride_h = 16 * ifm_shape[2] * ifm_shape[3] ofm_height = (ifm_shape[1] - pool_shape[0] + padding[0] + padding[0]) // strides[0] + 1 ofm_width = (ifm_shape[3] - pool_shape[1] + padding[1] + padding[1]) // strides[1] + 1 @@ -91,10 +57,10 @@ def _visit(stmt): ofm_stride_h = ofm_channels * ofm_width if ofm_height > 1 else 1 else: ofm_stride_w = 16 - ofm_stride_c = 16 * ofm_width + ofm_stride_c = 16 * ofm_width if ofm_channels >= 16 else 1 ofm_stride_h = 16 * ofm_width * ((ofm_channels - 1) // 16 + 1) - serial_pooling = spec.SerialPooling( + return spec.SerialPooling( ifm=spec.SerialFeatureMap( data_type="int8", height=ifm_shape[1], @@ -154,8 +120,139 @@ def _visit(stmt): upscale="NONE", ) + +@pytest.mark.parametrize( + "ifm_shape, ofm_channels, ifm_layout, ofm_layout, rounding_mode", + [ + ((1, 5, 9, 3), 3, "NHWC", "NHWC", "TFL"), + ((1, 8, 3, 9, 16), 40, "NHCWB16", "NHCWB16", "NATURAL"), + ((1, 8, 3, 9, 16), 40, "NHCWB16", "NHWC", "TRUNCATE"), + ((1, 8, 9, 40), 40, "NHWC", "NHCWB16", "TFL"), + ((1, 8, 9, 8), 8, "NHWC", "NHCWB16", "TFL"), + ], +) +@pytest.mark.parametrize("pooling_type", ["AVG", "MAX"]) +@pytest.mark.parametrize("activation", ["NONE", "CLIP"]) +def test_pooling_single( + ifm_shape, + ofm_channels, + ifm_layout, + ofm_layout, + pooling_type, + activation, + rounding_mode, +): + pool_shape = (3, 2) + strides = (1, 2) + padding = (1, 1, 1, 0) + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + pooling = make_ethosu_pooling( + ifm, + pooling_type, + pool_shape, + ofm_channels, + strides, + padding, + activation, + ifm_layout, + ofm_layout, + rounding_mode, + ) + func = relay.Function(relay.analysis.free_vars(pooling), pooling) + func = run_opt_pass(func, relay.transform.InferType()) + mod, _ 
= lower_to_tir(func) + data = [] + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Call): + data.append(get_pooling_args(stmt)) + + tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _visit) + + serial_pooling = _create_serial_pooling( + ifm_shape, + ofm_channels, + ifm_layout, + ofm_layout, + pool_shape, + pooling_type, + strides, + padding, + activation, + rounding_mode, + ) assert data[0] == ["ethosu_pooling"] + list(serial_pooling) +def test_correct_stride_with_multiple_pooling(): + """Testing a specific case of two pooling operations with NHWC inputs/outputs + but a NHCWB16 intermediate tensor. This led to elements being accessed in the + wrong order by the NPU, due to incorrect stride values being calculated.""" + + ifm_shape = (1, 4, 4, 8) + ofm_channels = 8 + pooling_type = "MAX" + pool_shape = (1, 1) + strides = (1, 1) + padding = (0, 0, 0, 0) + + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + op = make_ethosu_pooling( + ifm, + pooling_type, + pool_shape, + ofm_channels, + strides, + padding, + ifm_layout="NHWC", + ofm_layout="NHCWB16", + ) + op = make_ethosu_pooling( + op, + pooling_type, + pool_shape, + ofm_channels, + strides, + padding, + ifm_layout="NHCWB16", + ofm_layout="NHWC", + ) + func = relay.Function(relay.analysis.free_vars(op), op) + func = run_opt_pass(func, relay.transform.InferType()) + mod, _ = lower_to_tir(func) + + data = [] + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Call): + data.append(get_pooling_args(stmt)) + + tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _visit) + + serial_pooling_1 = _create_serial_pooling( + [1, 4, 4, 8], + 8, + "NHWC", + "NHCWB16", + pool_shape, + pooling_type, + strides, + padding, + ) + serial_pooling_2 = _create_serial_pooling( + [1, 4, 1, 4, 16], + 8, + "NHCWB16", + "NHWC", + pool_shape, + pooling_type, + strides, + padding, + ) + + assert data[0] == ["ethosu_pooling"] + list(serial_pooling_1) + assert data[1] == ["ethosu_pooling"] + list(serial_pooling_2) + + if __name__ 
== "__main__": pytest.main([__file__])