diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index 2cf45170e4e3..d47b3d4a7de6 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -18,12 +18,13 @@ """The integration of the Arm(R) Ethos(TM)-U NPU TIR compiler.""" import tvm from tvm import relay -from tvm.relay.expr_functor import ExprMutator from tvm.driver.build_module import schedule_to_module +from tvm.relay.backend.contrib.ethosu import vela_api as vapi +from tvm.relay.expr_functor import ExprMutator +from .. import util from . import passes as ethosu_passes from .scheduler import schedule -from .. import util def lower_ethosu(sch, args, const_dict, name="main"): @@ -92,7 +93,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = ethosu_passes.HoistAllocates()(mod) mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod) - mod = ethosu_passes.CopyComputeReordering()(mod) + mod = ethosu_passes.CopyComputeReordering(vapi.get_max_copy_movements())(mod) disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False) if not disable_storage_rewrite: diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py b/python/tvm/relay/backend/contrib/ethosu/vela_api.py index f241652e738f..45c232a4610b 100644 --- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py +++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py @@ -23,13 +23,15 @@ """ import logging import math -from typing import Tuple, Optional, List +from typing import List, Optional, Tuple + import numpy as np # type: ignore from ethosu.vela import api as vapi # type: ignore +from ethosu.vela.architecture_features import Accelerator, create_default_arch import tvm -from tvm.relay.backend.contrib.ethosu import util # type: ignore from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs +from tvm.relay.backend.contrib.ethosu import util # type: ignore # pylint: disable=invalid-name logger = logging.getLogger("Ethos-U") @@ -400,3 +402,12 @@ def get_accelerator_config() -> vapi.NpuAccelerator: accel_config_str = compiler_attrs.accelerator_config assert accel_config_str in npu_accel_str_map.keys(), f"{accel_config_str} is not supported" return npu_accel_str_map[accel_config_str] + + +def get_max_copy_movements() -> int: + """Get maximum copy movements for CopyComputeReordering pass. + max_outstanding_dma from architecture features indicates how many + DMA operations can be in-progress. + """ + arch = create_default_arch(Accelerator.from_npu_accelerator(get_accelerator_config())) + return arch.max_outstanding_dma diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 6a8ff28e442e..4341f367f0e1 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -66,30 +66,29 @@ def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_writ @tvm.script.ir_module class WeightStreamOnlyU65: @T.prim_func - def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None: - # function attr dict - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - # buffer definition - placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) - buffer_encoded_1 = T.Buffer([192], dtype="uint8") - buffer_encoded_2_1 = T.Buffer([192], dtype="uint8") - buffer_encoded_4_1 = T.Buffer([208], dtype="uint8") - buffer_encoded_6_1 = T.Buffer([192], dtype="uint8") - # body - p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.Buffer([208], "uint8", data=p1_data) - p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.Buffer([192], "uint8", data=p2_data) - p3 = T.Buffer([192], dtype="uint8", data=p1.data) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 80, p3[80], 80, 12, p3[160], 16, p3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 208, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 192, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 96, p1[96], 80, 12, p1[176], 16, p1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)}) + p2_global_6 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + p2_global_4 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + p2_global_5 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + buffer_encoded = T.Buffer((192,), "uint8") + p2_global_3 = T.Buffer((192,), "uint8", data=p2_global_6) + T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 192, p2_global_3[0]) + buffer_encoded_1 = T.Buffer((192,), "uint8") + p2_global_4_1 = T.Buffer((192,), "uint8", data=p2_global_4) + T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 192, p2_global_4_1[0]) + buffer_encoded_2 = T.Buffer((208,), "uint8") + p2_global_5_1 = T.Buffer((208,), "uint8", data=p2_global_5) + T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 208, p2_global_5_1[0]) + ifm_1 = T.Buffer((8192,), "int8", data=ifm.data) + ethosu_write_1 = T.Buffer((2048,), "int8", data=ethosu_write.data) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_3[0], 80, p2_global_3[80], 80, 12, p2_global_3[160], 16, p2_global_3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + buffer_encoded_3 = T.Buffer((192,), "uint8") + p2_global_6_1 = T.Buffer((192,), "uint8", data=p2_global_6) + T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 192, p2_global_6_1[0]) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_4_1[0], 80, p2_global_4_1[80], 80, 12, p2_global_4_1[160], 16, p2_global_4_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_5_1[0], 96, p2_global_5_1[96], 80, 12, p2_global_5_1[176], 16, p2_global_5_1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_6_1[0], 80, p2_global_6_1[80], 80, 12, p2_global_6_1[160], 16, p2_global_6_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) __tvm_meta__ = None # fmt: on @@ -387,33 +386,34 @@ def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer @tvm.script.ir_module class MixedReadU65: @T.prim_func - def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer((1,16,16,8), "int8")) -> None: - # function attr dict - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - # buffer definition - ifm = T.Buffer([8192], dtype="int8", data=input_ifm.data) - ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) - buffer1 = T.Buffer([128], dtype="uint8") - buffer2 = T.Buffer([128], dtype="uint8") - buffer3 = T.Buffer([128], dtype="uint8") - buffer4 = T.Buffer([608], dtype="uint8") - buffer5 = T.Buffer([160], dtype="uint8") - buffer6 = T.Buffer([128], dtype="uint8") - p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.Buffer([128], "uint8", data=p1_data) - p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.Buffer([4096], "int8", data=p2_data) - p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.Buffer([128], "uint8", data=p3_data) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 128, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 128, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")): + T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)}) + p5_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + p5_global_1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + p5_global_2 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)}) + buffer_encoded = T.Buffer((128,), "uint8") + p5_global_3 = T.Buffer((128,), "uint8", data=p5_global) + T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 128, p5_global_3[0]) + buffer_encoded_1 = T.Buffer((128,), "uint8") + p5_global_4 = T.Buffer((128,), "uint8", data=p5_global_1) + T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 128, p5_global_4[0]) + ifm_1 = T.Buffer((8192,), "int8", data=ifm.data) + ethosu_write_2 = T.Buffer((4096,), "int8", data=ethosu_write_1) + p1_encoded = T.Buffer((608,), "uint8") + p2_encoded = T.Buffer((160,), "uint8") + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1_encoded[0], 304, p1_encoded[304], 304, 12, p2_encoded[0], 80, p2_encoded[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + buffer_encoded_2 = T.Buffer((128,), "uint8") + p5_global_5 = T.Buffer((128,), "uint8", data=p5_global_2) + T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 128, p5_global_5[0]) + ethosu_write_3 = T.Buffer((2048,), "int8", data=ethosu_write.data) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_3[0], 48, p5_global_3[48], 48, 12, p5_global_3[96], 16, p5_global_3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + buffer_encoded_3 = T.Buffer((128,), "uint8") + p5_global_6 = T.Buffer((128,), "uint8", data=p5_global) + T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 128, p5_global_6[0]) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_4[0], 48, p5_global_4[48], 48, 12, p5_global_4[96], 16, p5_global_4[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_5[0], 48, p5_global_5[48], 48, 12, p5_global_5[96], 16, p5_global_5[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) + T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_6[0], 48, p5_global_6[48], 48, 12, p5_global_6[96], 16, p5_global_6[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index 0df73d6dc561..a5490cbe2b1c 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -44,8 +44,8 @@ @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1793376), - ("ethos-u65-256", MOBILENET_V2_URL, 2217152), + ("ethos-u65-256", MOBILENET_V1_URL, 2338848), + ("ethos-u65-256", MOBILENET_V2_URL, 2264320), ("ethos-u55-256", MOBILENET_V1_URL, 1793376), ("ethos-u55-256", MOBILENET_V2_URL, 2217152), ("ethos-u55-128", MOBILENET_V2_URL, 2217152), @@ -71,7 +71,7 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size): @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1206880), + ("ethos-u65-256", MOBILENET_V1_URL, 1311200), ("ethos-u55-256", MOBILENET_V2_URL, 1509408), ], )