Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
"""The integration of the Arm(R) Ethos(TM)-U NPU TIR compiler."""
import tvm
from tvm import relay
from tvm.relay.expr_functor import ExprMutator
from tvm.driver.build_module import schedule_to_module
from tvm.relay.backend.contrib.ethosu import vela_api as vapi
from tvm.relay.expr_functor import ExprMutator

from .. import util
from . import passes as ethosu_passes
from .scheduler import schedule
from .. import util


def lower_ethosu(sch, args, const_dict, name="main"):
Expand Down Expand Up @@ -92,7 +93,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = ethosu_passes.HoistAllocates()(mod)
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod)
mod = ethosu_passes.CopyComputeReordering()(mod)
mod = ethosu_passes.CopyComputeReordering(vapi.get_max_copy_movements())(mod)

disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
if not disable_storage_rewrite:
Expand Down
15 changes: 13 additions & 2 deletions python/tvm/relay/backend/contrib/ethosu/vela_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@
"""
import logging
import math
from typing import Tuple, Optional, List
from typing import List, Optional, Tuple

import numpy as np # type: ignore
from ethosu.vela import api as vapi # type: ignore
from ethosu.vela.architecture_features import Accelerator, create_default_arch

import tvm
from tvm.relay.backend.contrib.ethosu import util # type: ignore
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
from tvm.relay.backend.contrib.ethosu import util # type: ignore

# pylint: disable=invalid-name
logger = logging.getLogger("Ethos-U")
Expand Down Expand Up @@ -400,3 +402,12 @@ def get_accelerator_config() -> vapi.NpuAccelerator:
accel_config_str = compiler_attrs.accelerator_config
assert accel_config_str in npu_accel_str_map.keys(), f"{accel_config_str} is not supported"
return npu_accel_str_map[accel_config_str]


def get_max_copy_movements() -> int:
"""Get maximum copy movements for CopyComputeReordering pass.
max_outstanding_dma from architecture features indicates how many
DMA operations can be in-progress.
"""
arch = create_default_arch(Accelerator.from_npu_accelerator(get_accelerator_config()))
return arch.max_outstanding_dma
102 changes: 51 additions & 51 deletions tests/python/contrib/test_ethosu/test_encode_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,30 +66,29 @@ def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_writ
@tvm.script.ir_module
class WeightStreamOnlyU65:
@T.prim_func
def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
# buffer definition
placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data)
ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
buffer_encoded_1 = T.Buffer([192], dtype="uint8")
buffer_encoded_2_1 = T.Buffer([192], dtype="uint8")
buffer_encoded_4_1 = T.Buffer([208], dtype="uint8")
buffer_encoded_6_1 = T.Buffer([192], dtype="uint8")
# body
p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True})
p1 = T.Buffer([208], "uint8", data=p1_data)
p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True})
p2 = T.Buffer([192], "uint8", data=p2_data)
p3 = T.Buffer([192], dtype="uint8", data=p1.data)
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 80, p3[80], 80, 12, p3[160], 16, p3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 208, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 192, p2[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 96, p1[96], 80, 12, p1[176], 16, p1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
p2_global_6 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p2_global_4 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p2_global_5 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
buffer_encoded = T.Buffer((192,), "uint8")
p2_global_3 = T.Buffer((192,), "uint8", data=p2_global_6)
T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 192, p2_global_3[0])
buffer_encoded_1 = T.Buffer((192,), "uint8")
p2_global_4_1 = T.Buffer((192,), "uint8", data=p2_global_4)
T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 192, p2_global_4_1[0])
buffer_encoded_2 = T.Buffer((208,), "uint8")
p2_global_5_1 = T.Buffer((208,), "uint8", data=p2_global_5)
T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 208, p2_global_5_1[0])
ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
ethosu_write_1 = T.Buffer((2048,), "int8", data=ethosu_write.data)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_3[0], 80, p2_global_3[80], 80, 12, p2_global_3[160], 16, p2_global_3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_3 = T.Buffer((192,), "uint8")
p2_global_6_1 = T.Buffer((192,), "uint8", data=p2_global_6)
T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 192, p2_global_6_1[0])
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_4_1[0], 80, p2_global_4_1[80], 80, 12, p2_global_4_1[160], 16, p2_global_4_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_5_1[0], 96, p2_global_5_1[96], 80, 12, p2_global_5_1[176], 16, p2_global_5_1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_6_1[0], 80, p2_global_6_1[80], 80, 12, p2_global_6_1[160], 16, p2_global_6_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
__tvm_meta__ = None
# fmt: on

Expand Down Expand Up @@ -387,33 +386,34 @@ def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer
@tvm.script.ir_module
class MixedReadU65:
@T.prim_func
def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer((1,16,16,8), "int8")) -> None:
# function attr dict
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
# buffer definition
ifm = T.Buffer([8192], dtype="int8", data=input_ifm.data)
ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
buffer1 = T.Buffer([128], dtype="uint8")
buffer2 = T.Buffer([128], dtype="uint8")
buffer3 = T.Buffer([128], dtype="uint8")
buffer4 = T.Buffer([608], dtype="uint8")
buffer5 = T.Buffer([160], dtype="uint8")
buffer6 = T.Buffer([128], dtype="uint8")
p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
p1 = T.Buffer([128], "uint8", data=p1_data)
p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
p2 = T.Buffer([4096], "int8", data=p2_data)
p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
p3 = T.Buffer([128], "uint8", data=p3_data)
T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 128, p1[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 128, p3[0], dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
p5_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p5_global_1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin": T.bool(True)})
p5_global_2 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
buffer_encoded = T.Buffer((128,), "uint8")
p5_global_3 = T.Buffer((128,), "uint8", data=p5_global)
T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 128, p5_global_3[0])
buffer_encoded_1 = T.Buffer((128,), "uint8")
p5_global_4 = T.Buffer((128,), "uint8", data=p5_global_1)
T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 128, p5_global_4[0])
ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
ethosu_write_2 = T.Buffer((4096,), "int8", data=ethosu_write_1)
p1_encoded = T.Buffer((608,), "uint8")
p2_encoded = T.Buffer((160,), "uint8")
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1_encoded[0], 304, p1_encoded[304], 304, 12, p2_encoded[0], 80, p2_encoded[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_2 = T.Buffer((128,), "uint8")
p5_global_5 = T.Buffer((128,), "uint8", data=p5_global_2)
T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 128, p5_global_5[0])
ethosu_write_3 = T.Buffer((2048,), "int8", data=ethosu_write.data)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_3[0], 48, p5_global_3[48], 48, 12, p5_global_3[96], 16, p5_global_3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
buffer_encoded_3 = T.Buffer((128,), "uint8")
p5_global_6 = T.Buffer((128,), "uint8", data=p5_global)
T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 128, p5_global_6[0])
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_4[0], 48, p5_global_4[48], 48, 12, p5_global_4[96], 16, p5_global_4[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_5[0], 48, p5_global_5[48], 48, 12, p5_global_5[96], 16, p5_global_5[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_6[0], 48, p5_global_6[48], 48, 12, p5_global_6[96], 16, p5_global_6[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
__tvm_meta__ = None
# fmt: on

Expand Down
6 changes: 3 additions & 3 deletions tests/python/contrib/test_ethosu/test_networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1793376),
("ethos-u65-256", MOBILENET_V2_URL, 2217152),
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
Expand All @@ -71,7 +71,7 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
("ethos-u65-256", MOBILENET_V1_URL, 1206880),
("ethos-u65-256", MOBILENET_V1_URL, 1311200),
("ethos-u55-256", MOBILENET_V2_URL, 1509408),
],
)
Expand Down