From 46c0c536b40c230eb52be24468cd5a6c710fba6e Mon Sep 17 00:00:00 2001
From: Aleksei-grovety <113356454+Aleksei-grovety@users.noreply.github.com>
Date: Mon, 3 Apr 2023 16:04:41 +0400
Subject: [PATCH 1/2] [microNPU][ETHOSU] Add restrictions to convert to
 NHCWB16 layout in LayoutOptimization pass

Previously the NHCWB16 format was used for all operations performed by the
NPU, but there are limitations: for example, ReduceSum requires NHWC input
when the input tensor has type int32 or the accelerator is Ethos_U65_512.
https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/tags/3.7.0/ethosu/vela/graph_optimiser_util.py#126
---
 .../relay/backend/contrib/ethosu/codegen.py   | 90 +++++++++++++++----
 .../test_ethosu/test_layout_optimizer.py      | 38 ++++++++
 2 files changed, 112 insertions(+), 16 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index b07b260f1965..5a5f1478e16e 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -16,8 +16,9 @@
 # under the License.
 """Codegen for Arm(R) Ethos(TM)-U NPU"""
 from collections import defaultdict
-
 from typing import List, Callable
+
+from ethosu.vela import api as vapi
 import tvm
 from tvm import relay
 from tvm.relay.backend.contrib.ethosu.tir.compiler import LowerToTIR
@@ -30,7 +31,7 @@
     extract_memory_info,
 )
 from tvm.relay.backend.contrib.ethosu.legalize import LegalizeEthosU
-from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator, util
+from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator, util, vela_api
 from tvm.relay.expr_functor import ExprMutator, ExprVisitor
 
 # pylint: disable=unused-import
@@ -143,20 +144,25 @@ def __call__(self, *args, **kwargs):
 
 
 class AnalyzeConsumers(ExprVisitor):
-    """Traverses the graph to determine consumers that are NPU operations. The
-    result is maintained in `npu_consumers`.
+    """Traverses the graph to determine which consumers are NPU operations and
+    which operations have restrictions on using the NHCWB16 layout. The result
+    is maintained in `npu_consumers` and `restrictions`.
 
     Attributes
     ----------
     npu_consumers : Dict[tvm.relay.expr.Call, List[bool]]
        Mapping from an NPU operation to a list of boolean values that represent
        whether or not each consumer is an NPU operation.
+    restrictions : Dict[tvm.relay.expr.Call, List[bool]]
+       Mapping from an NPU operation to a list of boolean values that represent
+       whether or not the operation has restrictions on using the NHCWB16 layout.
     optimize_ops : Dict[str, Callable]
        A map from NPU operation name to a function that creates the NPU operation.
""" def __init__(self, optimize_ops): self.npu_consumers = defaultdict(list) + self.restrictions = defaultdict(list) self.optimize_ops = optimize_ops super().__init__() @@ -174,6 +180,18 @@ def visit_call(self, call: relay.Call): for arg in args: if isinstance(arg, relay.Call) and arg.op.name in self.optimize_ops: self.npu_consumers[arg].append(is_npu_consumer) + # ReduceSum requires NHWC input in case input tensor has type int32 or + # accelerator is Ethos_U65_512 + # https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/tags/3.7.0/ethosu/vela/graph_optimiser_util.py#126 + has_restrictions = ( + call.op.name == "contrib.ethosu.pooling" + and call.attrs["pooling_type"] == "SUM" + and ( + arg.checked_type.dtype == "int32" + or vela_api.get_accelerator_config() == vapi.NpuAccelerator.Ethos_U65_512 + ) + ) + self.restrictions[arg].append(has_restrictions) super().visit_call(call) @@ -185,11 +203,11 @@ class LayoutOptimization(ExprMutator): operation depends on the following: Check alter input layout: For each argument, if the producer is also an NPU operation and - its output is altered to brick format, then the input layout with respect to the current - argument is altered to brick format. + its output is altered to brick format and there are no restrictions, then the input layout + with respect to the current argument is altered to brick format. - Check alter output layout: If all consumers (child nodes) are an NPU operation, then the - output layout is altered to brick format. + Check alter output layout: If all consumers (child nodes) are an NPU operation and + there are no restrictions, then the output layout is altered to brick format. Note ---- @@ -198,15 +216,19 @@ class LayoutOptimization(ExprMutator): Attributes ---------- - npu_consumers : Dict[tvm.relay.expr.Call, bool] + npu_consumers : Dict[tvm.relay.expr.Call, List[bool]] A map from current call to a list boolean values that state whether or not each consumer is an NPU operation. + restrictions : Dict[tvm.relay.expr.Call, List[bool]] + A map from current call to a list boolean values that state + whether or not operation has restrictions to use NHCWB16 layout. optimize_ops : Dict[str, Callable] A map from NPU operation name to function that creates NPU operation. """ - def __init__(self, npu_consumers, optimize_ops): + def __init__(self, npu_consumers, restrictions, optimize_ops): self.npu_consumers = npu_consumers + self.restrictions = restrictions self.optimize_ops = optimize_ops super().__init__() @@ -224,6 +246,39 @@ def alter_ethosu_op_layout(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Ca new_call : tvm.relay.expr.Call New call with altered layouts. """ + + def are_all_consumers_npu(call): + """ + Check whether or not each consumer is an NPU operation. + Parameters + ---------- + call : tvm.relay.expr.Call + The call pointing to an NPU operation. + + Returns + ------- + all_consumers_npu : bool + Whether each consumer is an NPU operation. + """ + consumers = self.npu_consumers[call] + return consumers and all(consumers) + + def check_restrictions(call): + """ + Check if there are any restrictions for call to use NHCWB16 layout. + Parameters + ---------- + call : tvm.relay.expr.Call + The call pointing to an NPU operation. + + Returns + ------- + any_restrictions : bool + Whether there are restrictions. 
+ """ + restrictions = self.restrictions[call] + return restrictions and any(restrictions) + assert isinstance(call.attrs, tvm.ir.Attrs), ( f"The attributes for operator '{call.op.name}' could not be " "found. Did you register the relay.attrs.EthosuAttrs " @@ -238,15 +293,16 @@ def alter_ethosu_op_layout(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Ca input_count += 1 if arg not in self.npu_consumers: continue - consumers = self.npu_consumers[arg] - parent_has_brick_output = consumers and all(consumers) - if parent_has_brick_output: + parent_has_brick_output = are_all_consumers_npu(arg) + parent_has_restrictions = check_restrictions(arg) + if parent_has_brick_output and not parent_has_restrictions: layout_string = "ifm_layout" if input_count <= 1 else f"ifm{input_count}_layout" new_attrs[layout_string] = "NHCWB16" # Check if we can rewrite the output layouts - consumers = self.npu_consumers[call] - if consumers and all(consumers): + has_brick_output = are_all_consumers_npu(call) + has_restrictions = check_restrictions(call) + if has_brick_output and not has_restrictions: new_attrs["ofm_layout"] = "NHCWB16" name = call.op.name @@ -293,7 +349,9 @@ def transform_npu_function(self, _, func: relay.Function) -> relay.Function: analyze = AnalyzeConsumers(optimize_ops) analyze.visit(func) - return LayoutOptimization(analyze.npu_consumers, optimize_ops).visit(func) + return LayoutOptimization(analyze.npu_consumers, analyze.restrictions, optimize_ops).visit( + func + ) def __call__(self, *args, **kwargs): pass diff --git a/tests/python/contrib/test_ethosu/test_layout_optimizer.py b/tests/python/contrib/test_ethosu/test_layout_optimizer.py index 9cc3637fdf8c..05b9dce4c929 100644 --- a/tests/python/contrib/test_ethosu/test_layout_optimizer.py +++ b/tests/python/contrib/test_ethosu/test_layout_optimizer.py @@ -121,6 +121,44 @@ def get_graph(): _assert_structural_equal(a, b) +@pytest.mark.parametrize("dtype", ["int8", "int32"]) +def test_add_reduce_sum(dtype): + """Test add with reduce sum to make sure the layouts remain + unaltered for int32 and altered for other types. + """ + + def get_graph(get_expected=False): + in_1 = relay.var("x", shape=(1, 2, 2, 2), dtype=dtype) + in_2 = relay.var("y", shape=(1, 2, 2, 2), dtype=dtype) + layout = "NHCWB16" if get_expected and dtype != "int32" else "NHWC" + add = infra.make_ethosu_binary_elementwise( + in_1, + in_2, + ifm_channels=2, + ifm2_channels=2, + operator_type="ADD", + ofm_dtype=dtype, + ifm_layout="NHWC", + ifm2_layout="NHWC", + ofm_layout=layout, + ) + x = infra.make_ethosu_pooling( + ifm=add, + pooling_type="SUM", + pool_shape=(1, 1), + ofm_channels=1, + strides=(1, 1), + padding=(0, 0), + ifm_layout=layout, + ofm_layout="NHWC", + ) + return relay.Function(relay.analysis.free_vars(x), x) + + a = _optimize(get_graph()) + b = _optimize(get_graph(get_expected=True), optimize=False) + _assert_structural_equal(a, b) + + def test_multiple_convolution(): """Test layout optimization pass on linear chain of convolutions. I.e, From d7568b8e6058b077fd2d8e6eabb6305105ebf229 Mon Sep 17 00:00:00 2001 From: Aleksei-grovety <113356454+Aleksei-grovety@users.noreply.github.com> Date: Thu, 6 Apr 2023 16:00:20 +0400 Subject: [PATCH 2/2] Add test to check reduce_sum operation with different input types. 
---
 tests/python/contrib/test_ethosu/infra.py     | 18 ++++--
 .../contrib/test_ethosu/test_codegen.py       | 56 +++++++++++++++++++
 2 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index e6ebec6ac4fa..b205a6d3350b 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -702,18 +702,24 @@ def make_ethosu_binary_elementwise(
     rescale_scale: int = 0,
     rescale_shift: int = 0,
     lut=relay.const([], dtype="int8"),
+    ifm_scale: float = 1.0,
+    ifm_zero_point: int = 0,
+    ifm2_scale: float = 1.0,
+    ifm2_zero_point: int = 0,
+    ofm_scale: float = 1.0,
+    ofm_zero_point: int = 0,
 ):
     ethosu_binary_elementwise = ethosu_ops.ethosu_binary_elementwise(
         ifm=ifm,
         ifm2=ifm2,
         lut=lut,
         operator_type=operator_type,
-        ifm_scale=1,
-        ifm_zero_point=0,
-        ifm2_scale=1,
-        ifm2_zero_point=0,
-        ofm_scale=1,
-        ofm_zero_point=0,
+        ifm_scale=ifm_scale,
+        ifm_zero_point=ifm_zero_point,
+        ifm2_scale=ifm2_scale,
+        ifm2_zero_point=ifm2_zero_point,
+        ofm_scale=ofm_scale,
+        ofm_zero_point=ofm_zero_point,
         ifm_channels=ifm_channels,
         ifm2_channels=ifm2_channels,
         reversed_operands=reversed_operands,
diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 6eb382d8f588..595a4b9efb85 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -520,6 +520,62 @@ def sum_func(x):
     )


+# Case to check the reduce_sum operation with different input types.
+@pytest.mark.parametrize("dtype", ["int8", "int32"])
+def test_add_reduce_sum(dtype):
+    ifm_shape = (1, 2, 2, 4)
+    accel_type = "ethos-u55-256"
+    np.random.seed(0)
+
+    def create_model():
+        ifm = relay.var("ifm", shape=ifm_shape, dtype=dtype)
+        ifm2 = relay.var("ifm2", shape=ifm_shape, dtype=dtype)
+        ifm_scale = 0.0 if dtype == "int32" else 1.0
+        op = infra.make_ethosu_binary_elementwise(
+            ifm,
+            ifm2,
+            ifm_shape[3],
+            ifm_shape[3],
+            "ADD",
+            dtype,
+            ifm_scale=ifm_scale,
+            ifm2_scale=ifm_scale,
+        )
+        op = infra.make_ethosu_pooling(
+            ifm=op,
+            pooling_type="SUM",
+            pool_shape=(1, 1),
+            ofm_channels=1,
+            strides=(1, 1),
+            padding=(0, 0, 0, 0),
+            rounding_mode="NATURAL",
+        )
+        return tvm.IRModule.from_expr(relay.Function([ifm, ifm2], op))
+
+    def generate_output_data(input_data):
+        lhs = input_data["ifm"]
+        rhs = input_data["ifm2"]
+        # The reduce_sum output type is int32.
+        output_dtype = "int32"
+        add = lhs + rhs
+        return [np.sum(add, axis=3).astype(output_dtype)]
+
+    cpu_mod = create_model()
+
+    # Generate reference data
+    in_min, in_max = -10, 19
+    lhs = np.random.randint(in_min, in_max, size=ifm_shape, dtype=dtype)
+    rhs = np.random.randint(in_min, in_max, size=ifm_shape, dtype=dtype)
+    input_data = {
+        "ifm": lhs,
+        "ifm2": rhs,
+    }
+    output_data = {"output": generate_output_data(input_data)[0]}
+    ethosu_mod = infra.create_ethosu_partition(cpu_mod)
+
+    infra.compare_ethosu_with_reference(ethosu_mod, input_data, output_data, accel_type)
+
+
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
 @pytest.mark.parametrize("dtype", ["int8", "uint8"])
 @pytest.mark.parametrize("constant", [np.ones((1, 1, 1, 1)), np.array(1)])
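
Note: assuming a TVM checkout with the Ethos-U test dependencies and Vela
installed, both new tests can be run through pytest's Python entry point;
the node IDs below follow the file layout used in this series.

    import pytest

    # Runs every parametrization (int8 and int32) of both new tests.
    pytest.main(
        [
            "tests/python/contrib/test_ethosu/test_layout_optimizer.py::test_add_reduce_sum",
            "tests/python/contrib/test_ethosu/test_codegen.py::test_add_reduce_sum",
        ]
    )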