diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index b07b260f1965..5a5f1478e16e 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -16,8 +16,9 @@
 # under the License.
 """Codegen for Arm(R) Ethos(TM)-U NPU"""
 from collections import defaultdict
-
 from typing import List, Callable
+
+from ethosu.vela import api as vapi
 import tvm
 from tvm import relay
 from tvm.relay.backend.contrib.ethosu.tir.compiler import LowerToTIR
@@ -30,7 +31,7 @@
     extract_memory_info,
 )
 from tvm.relay.backend.contrib.ethosu.legalize import LegalizeEthosU
-from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator, util
+from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator, util, vela_api
 from tvm.relay.expr_functor import ExprMutator, ExprVisitor

 # pylint: disable=unused-import
@@ -143,20 +144,25 @@ def __call__(self, *args, **kwargs):


 class AnalyzeConsumers(ExprVisitor):
-    """Traverses the graph to determine consumers that are NPU operations. The
-    result is maintained in `npu_consumers`.
+    """Traverses the graph to determine which consumers are NPU operations and
+    which operations are restricted from using the NHCWB16 layout. The results
+    are maintained in `npu_consumers` and `restrictions`.

     Attributes
     ----------
     npu_consumers : Dict[tvm.relay.expr.Call, List[bool]]
         Mapping from NPU operation to list of boolean values that represent
         whether or not each consumer is an NPU operation.
+    restrictions : Dict[tvm.relay.expr.Call, List[bool]]
+        Mapping from NPU operation to list of boolean values that represent
+        whether or not the operation is restricted from using the NHCWB16 layout.
     optimize_ops : Dict[str, Callable]
         A map from NPU operation name to function that creates NPU operation.
     """

     def __init__(self, optimize_ops):
         self.npu_consumers = defaultdict(list)
+        self.restrictions = defaultdict(list)
         self.optimize_ops = optimize_ops
         super().__init__()

@@ -174,6 +180,18 @@ def visit_call(self, call: relay.Call):
         for arg in args:
             if isinstance(arg, relay.Call) and arg.op.name in self.optimize_ops:
                 self.npu_consumers[arg].append(is_npu_consumer)
+                # ReduceSum requires an NHWC input when the input tensor has type
+                # int32 or the accelerator is Ethos_U65_512, see:
+                # https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/tags/3.7.0/ethosu/vela/graph_optimiser_util.py#126
+                has_restrictions = (
+                    call.op.name == "contrib.ethosu.pooling"
+                    and call.attrs["pooling_type"] == "SUM"
+                    and (
+                        arg.checked_type.dtype == "int32"
+                        or vela_api.get_accelerator_config() == vapi.NpuAccelerator.Ethos_U65_512
+                    )
+                )
+                self.restrictions[arg].append(has_restrictions)

         super().visit_call(call)

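Reviewer note: the restriction predicate added to `visit_call` is dense, so here is a minimal, dependency-free sketch of the same check, mirroring the Vela 3.7.0 graph-optimiser rule linked above. The standalone function and its string-typed accelerator argument are illustrative only, not part of the patch:

```python
# Hypothetical standalone rendering of the predicate appended per argument in
# AnalyzeConsumers.visit_call; the real code reads these values from the
# relay.Call node and from vela_api.get_accelerator_config().
def has_nhcwb16_restriction(
    consumer_op: str, pooling_type: str, input_dtype: str, accelerator: str
) -> bool:
    """True when the consumer is a SUM pooling that must keep an NHWC input."""
    return (
        consumer_op == "contrib.ethosu.pooling"
        and pooling_type == "SUM"
        and (input_dtype == "int32" or accelerator == "ethos-u65-512")
    )


# int32 inputs to ReduceSum are restricted on any accelerator ...
assert has_nhcwb16_restriction("contrib.ethosu.pooling", "SUM", "int32", "ethos-u55-256")
# ... while int8 inputs are restricted only on Ethos-U65-512.
assert not has_nhcwb16_restriction("contrib.ethosu.pooling", "SUM", "int8", "ethos-u55-256")
assert has_nhcwb16_restriction("contrib.ethosu.pooling", "SUM", "int8", "ethos-u65-512")
```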
@@ -185,11 +203,11 @@ class LayoutOptimization(ExprMutator):
     operation depends on the following:

     Check alter input layout: For each argument, if the producer is also an NPU operation and
-    its output is altered to brick format, then the input layout with respect to the current
-    argument is altered to brick format.
+    its output is altered to brick format and there are no restrictions, then the input layout
+    with respect to the current argument is altered to brick format.

-    Check alter output layout: If all consumers (child nodes) are an NPU operation, then the
-    output layout is altered to brick format.
+    Check alter output layout: If all consumers (child nodes) are an NPU operation and
+    there are no restrictions, then the output layout is altered to brick format.

     Note
     ----
@@ -198,15 +216,19 @@ class LayoutOptimization(ExprMutator):

     Attributes
     ----------
-    npu_consumers : Dict[tvm.relay.expr.Call, bool]
+    npu_consumers : Dict[tvm.relay.expr.Call, List[bool]]
         A map from current call to a list boolean values that state
         whether or not each consumer is an NPU operation.
+    restrictions : Dict[tvm.relay.expr.Call, List[bool]]
+        A map from the current call to a list of boolean values that state
+        whether or not the operation is restricted from using the NHCWB16 layout.
     optimize_ops : Dict[str, Callable]
         A map from NPU operation name to function that creates NPU operation.
     """

-    def __init__(self, npu_consumers, optimize_ops):
+    def __init__(self, npu_consumers, restrictions, optimize_ops):
         self.npu_consumers = npu_consumers
+        self.restrictions = restrictions
         self.optimize_ops = optimize_ops
         super().__init__()

@@ -224,6 +246,39 @@ def alter_ethosu_op_layout(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Ca
         new_call : tvm.relay.expr.Call
             New call with altered layouts.
         """
+
+        def are_all_consumers_npu(call):
+            """
+            Check whether or not each consumer is an NPU operation.
+            Parameters
+            ----------
+            call : tvm.relay.expr.Call
+                The call pointing to an NPU operation.
+
+            Returns
+            -------
+            all_consumers_npu : bool
+                Whether each consumer is an NPU operation.
+            """
+            consumers = self.npu_consumers[call]
+            return consumers and all(consumers)
+
+        def check_restrictions(call):
+            """
+            Check if any restrictions prevent the call from using the NHCWB16 layout.
+            Parameters
+            ----------
+            call : tvm.relay.expr.Call
+                The call pointing to an NPU operation.
+
+            Returns
+            -------
+            any_restrictions : bool
+                Whether there are restrictions.
+            """
+            restrictions = self.restrictions[call]
+            return restrictions and any(restrictions)
+
         assert isinstance(call.attrs, tvm.ir.Attrs), (
             f"The attributes for operator '{call.op.name}' could not be "
             "found. Did you register the relay.attrs.EthosuAttrs "
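Reviewer note: a subtle point in both helpers is the `x and all(x)` / `x and any(x)` guard. Because `all([])` is `True` (while `any([])` is `False`), an operation with no recorded consumers would otherwise be treated as if every consumer were an NPU operation. A self-contained illustration, with plain strings standing in for `relay.Call` nodes and hypothetical data:

```python
from collections import defaultdict

npu_consumers = defaultdict(list)
npu_consumers["conv_a"] = [True, True]   # every consumer is an NPU operation
npu_consumers["conv_b"] = [True, False]  # one consumer is not
# "conv_c" has no recorded consumers at all


def are_all_consumers_npu(key):
    # Guard against the empty list: all([]) is True, but an operation with
    # no NPU consumers must not have its output layout rewritten.
    consumers = npu_consumers[key]
    return bool(consumers and all(consumers))


assert are_all_consumers_npu("conv_a")
assert not are_all_consumers_npu("conv_b")
assert not are_all_consumers_npu("conv_c")  # all([]) alone would say True
```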
@@ -238,15 +293,16 @@ def alter_ethosu_op_layout(self, call: tvm.relay.expr.Call) -> tvm.relay.expr.Ca
             input_count += 1
             if arg not in self.npu_consumers:
                 continue
-            consumers = self.npu_consumers[arg]
-            parent_has_brick_output = consumers and all(consumers)
-            if parent_has_brick_output:
+            parent_has_brick_output = are_all_consumers_npu(arg)
+            parent_has_restrictions = check_restrictions(arg)
+            if parent_has_brick_output and not parent_has_restrictions:
                 layout_string = "ifm_layout" if input_count <= 1 else f"ifm{input_count}_layout"
                 new_attrs[layout_string] = "NHCWB16"

         # Check if we can rewrite the output layouts
-        consumers = self.npu_consumers[call]
-        if consumers and all(consumers):
+        has_brick_output = are_all_consumers_npu(call)
+        has_restrictions = check_restrictions(call)
+        if has_brick_output and not has_restrictions:
             new_attrs["ofm_layout"] = "NHCWB16"

         name = call.op.name
@@ -293,7 +349,9 @@ def transform_npu_function(self, _, func: relay.Function) -> relay.Function:
         analyze = AnalyzeConsumers(optimize_ops)
         analyze.visit(func)

-        return LayoutOptimization(analyze.npu_consumers, optimize_ops).visit(func)
+        return LayoutOptimization(analyze.npu_consumers, analyze.restrictions, optimize_ops).visit(
+            func
+        )

     def __call__(self, *args, **kwargs):
         pass
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index e6ebec6ac4fa..b205a6d3350b 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -702,18 +702,24 @@ def make_ethosu_binary_elementwise(
     rescale_scale: int = 0,
     rescale_shift: int = 0,
     lut=relay.const([], dtype="int8"),
+    ifm_scale: float = 1.0,
+    ifm_zero_point: int = 0,
+    ifm2_scale: float = 1.0,
+    ifm2_zero_point: int = 0,
+    ofm_scale: float = 1.0,
+    ofm_zero_point: int = 0,
 ):
     ethosu_binary_elementwise = ethosu_ops.ethosu_binary_elementwise(
         ifm=ifm,
         ifm2=ifm2,
         lut=lut,
         operator_type=operator_type,
-        ifm_scale=1,
-        ifm_zero_point=0,
-        ifm2_scale=1,
-        ifm2_zero_point=0,
-        ofm_scale=1,
-        ofm_zero_point=0,
+        ifm_scale=ifm_scale,
+        ifm_zero_point=ifm_zero_point,
+        ifm2_scale=ifm2_scale,
+        ifm2_zero_point=ifm2_zero_point,
+        ofm_scale=ofm_scale,
+        ofm_zero_point=ofm_zero_point,
         ifm_channels=ifm_channels,
         ifm2_channels=ifm2_channels,
         reversed_operands=reversed_operands,
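Reviewer note: since the new keyword arguments default to the previously hard-coded values, existing call sites of the helper are unchanged; only tests that need non-default quantization pass them explicitly. A usage sketch mirroring the new codegen test below; the import path assumes the TVM source tree is on `sys.path`, and the Ethos-U/Vela dependencies are installed:

```python
from tvm import relay

from tests.python.contrib.test_ethosu import infra

ifm = relay.var("ifm", shape=(1, 2, 2, 4), dtype="int32")
ifm2 = relay.var("ifm2", shape=(1, 2, 2, 4), dtype="int32")
add = infra.make_ethosu_binary_elementwise(
    ifm,
    ifm2,
    ifm_channels=4,
    ifm2_channels=4,
    operator_type="ADD",
    ofm_dtype="int32",
    ifm_scale=0.0,  # int32 elementwise inputs carry no quantization scale
    ifm2_scale=0.0,
)
```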
diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 6eb382d8f588..595a4b9efb85 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -520,6 +520,62 @@ def sum_func(x):
     )


+# Test case checking the reduce_sum operation with different input types.
+@pytest.mark.parametrize("dtype", ["int8", "int32"])
+def test_add_reduce_sum(dtype):
+    ifm_shape = (1, 2, 2, 4)
+    accel_type = "ethos-u55-256"
+    np.random.seed(0)
+
+    def create_model():
+        ifm = relay.var("ifm", shape=ifm_shape, dtype=dtype)
+        ifm2 = relay.var("ifm2", shape=ifm_shape, dtype=dtype)
+        ifm_scale = 0.0 if dtype == "int32" else 1.0
+        op = infra.make_ethosu_binary_elementwise(
+            ifm,
+            ifm2,
+            ifm_shape[3],
+            ifm_shape[3],
+            "ADD",
+            dtype,
+            ifm_scale=ifm_scale,
+            ifm2_scale=ifm_scale,
+        )
+        op = infra.make_ethosu_pooling(
+            ifm=op,
+            pooling_type="SUM",
+            pool_shape=(1, 1),
+            ofm_channels=1,
+            strides=(1, 1),
+            padding=(0, 0, 0, 0),
+            rounding_mode="NATURAL",
+        )
+        return tvm.IRModule.from_expr(relay.Function([ifm, ifm2], op))
+
+    def generate_output_data(input_data):
+        lhs = input_data["ifm"]
+        rhs = input_data["ifm2"]
+        # The reduce_sum output type is int32.
+        output_dtype = "int32"
+        add = lhs + rhs
+        return [np.sum(add, axis=3).astype(output_dtype)]
+
+    cpu_mod = create_model()
+
+    # Generate reference data
+    in_min, in_max = -10, 19
+    lhs = np.random.randint(in_min, in_max, size=ifm_shape, dtype=dtype)
+    rhs = np.random.randint(in_min, in_max, size=ifm_shape, dtype=dtype)
+    input_data = {
+        "ifm": lhs,
+        "ifm2": rhs,
+    }
+    output_data = {"output": generate_output_data(input_data)[0]}
+    ethosu_mod = infra.create_ethosu_partition(cpu_mod)
+
+    infra.compare_ethosu_with_reference(ethosu_mod, input_data, output_data, accel_type)
+
+
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
 @pytest.mark.parametrize("dtype", ["int8", "uint8"])
 @pytest.mark.parametrize("constant", [np.ones((1, 1, 1, 1)), np.array(1)])
diff --git a/tests/python/contrib/test_ethosu/test_layout_optimizer.py b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
index 9cc3637fdf8c..05b9dce4c929 100644
--- a/tests/python/contrib/test_ethosu/test_layout_optimizer.py
+++ b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
@@ -121,6 +121,44 @@ def get_graph():
     _assert_structural_equal(a, b)


+@pytest.mark.parametrize("dtype", ["int8", "int32"])
+def test_add_reduce_sum(dtype):
+    """Test add followed by reduce sum to make sure the layouts remain
+    unaltered for int32 and are altered for other types.
+    """
+
+    def get_graph(get_expected=False):
+        in_1 = relay.var("x", shape=(1, 2, 2, 2), dtype=dtype)
+        in_2 = relay.var("y", shape=(1, 2, 2, 2), dtype=dtype)
+        layout = "NHCWB16" if get_expected and dtype != "int32" else "NHWC"
+        add = infra.make_ethosu_binary_elementwise(
+            in_1,
+            in_2,
+            ifm_channels=2,
+            ifm2_channels=2,
+            operator_type="ADD",
+            ofm_dtype=dtype,
+            ifm_layout="NHWC",
+            ifm2_layout="NHWC",
+            ofm_layout=layout,
+        )
+        x = infra.make_ethosu_pooling(
+            ifm=add,
+            pooling_type="SUM",
+            pool_shape=(1, 1),
+            ofm_channels=1,
+            strides=(1, 1),
+            padding=(0, 0),
+            ifm_layout=layout,
+            ofm_layout="NHWC",
+        )
+        return relay.Function(relay.analysis.free_vars(x), x)
+
+    a = _optimize(get_graph())
+    b = _optimize(get_graph(get_expected=True), optimize=False)
+    _assert_structural_equal(a, b)
+
+
 def test_multiple_convolution():
     """Test layout optimization pass on linear chain of convolutions. I.e,
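Reviewer note: the reference function `generate_output_data` in the codegen test is compact, so the following standalone NumPy check spells out what it computes for the int8 case. Shapes and the int32 accumulation come from the test above; the explicit widening cast before the add is an extra safety step, not in the test itself:

```python
import numpy as np

np.random.seed(0)
ifm_shape = (1, 2, 2, 4)
lhs = np.random.randint(-10, 19, size=ifm_shape, dtype="int8")
rhs = np.random.randint(-10, 19, size=ifm_shape, dtype="int8")

# ADD, then SUM pooling over the channel axis; reduce_sum accumulates to int32.
expected = np.sum(lhs.astype("int32") + rhs.astype("int32"), axis=3).astype("int32")
assert expected.shape == (1, 2, 2)
```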