From f1a16c8d5c18ee8fa3d0e93f1c69cc68231cd2e4 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 12 Jul 2023 09:10:48 -0500
Subject: [PATCH 1/6] [Ethos-U] Replace ethos-u.constants with AllocateConst

Previously, constants for ethos-u were tracked using a function
attribute `"ethos-u.constants"`.  This predates the introduction of
`AllocateConst`, and had comments indicating that it should be
replaced with `AllocateConst` when possible.

To minimize impact to existing passes, this commit preserves the
`"ethos-u.constants"` attribute during ethosu-specific lowering
passes.  The attribute is converted to `AllocateConst` at the end of
the `lower_ethosu` pass, just prior to lowering with the usual TIR
passes.
---
 .../relay/backend/contrib/ethosu/codegen.py   |  37 ++++-
 .../backend/contrib/ethosu/tir/compiler.py    |  20 +--
 .../backend/contrib/ethosu/tir/passes.py      |  26 ++--
 python/tvm/relay/op/contrib/ethosu.py         |  32 ++--
 src/relay/backend/contrib/ethosu/codegen.cc   |   3 -
 src/tir/ir/stmt.cc                            |   8 +-
 .../test_ethosu/cascader/test_integration.py  |   8 +-
 tests/python/contrib/test_ethosu/infra.py     |  59 +++++++
 .../contrib/test_ethosu/test_compiler.py      |   2 +-
 .../test_ethosu/test_encode_constants.py      | 147 ++++++++++++------
 .../test_ethosu/test_remove_concatenates.py   |  32 ++--
 .../test_replace_binary_elementwise.py        |   4 +-
 .../test_ethosu/test_replace_conv2d.py        |  85 ++++++----
 .../contrib/test_ethosu/test_replace_copy.py  |  21 ++-
 .../test_replace_depthwise_conv2d.py          |   2 +-
 .../test_ethosu/test_replace_identity.py      |   2 +-
 .../test_ethosu/test_replace_pooling.py       |   6 +-
 .../test_replace_unary_elementwise.py         |   2 +-
 .../contrib/test_ethosu/test_scheduler.py     |  13 +-
 .../test_tir_transform_make_packed_api.py     |   5 +-
 20 files changed, 361 insertions(+), 153 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index 04b40a9e64ea..cf595ab2cec0 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -16,7 +16,7 @@
 # under the License.
 """Codegen for Arm(R) Ethos(TM)-U NPU"""
 from collections import defaultdict
-from typing import List, Callable
+from typing import List, Callable, Dict
 
 from ethosu.vela import api as vapi
 import tvm
@@ -720,6 +720,34 @@ def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule:
     return tir_mod
 
 
+def collect_consts(mod: tvm.IRModule) -> Dict[tvm.tir.Var, tvm.nd.NDArray]:
+    """Collect any AllocateConst
+
+    Parameters
+    ----------
+    mod: tvm.IRModule
+
+        The module to inspect.
+
+    Returns
+    -------
+    const_dict: Dict[tvm.tir.Var, tvm.nd.NDArray]
+
+        A map from buffer var to NDArray, from AllocateConst nodes in
+        the module
+    """
+    constants = {}
+
+    def _visit(stmt):
+        if isinstance(stmt, tvm.tir.AllocateConst):
+            constants[stmt.buffer_var] = stmt.data
+
+    for func in mod.functions.values():
+        tvm.tir.stmt_functor.post_order_visit(func.body, _visit)
+
+    return constants
+
+
 @tvm._ffi.register_func("relay.ext.ethos-u.primfunc_to_artifact")
 def primfunc_to_artifact(primfunc: tvm.tir.PrimFunc) -> util.CompilationArtifact:
     """
@@ -739,13 +767,12 @@ def primfunc_to_artifact(primfunc: tvm.tir.PrimFunc) -> util.CompilationArtifact
         for the microNPU
     """
     symbol = str(primfunc.attrs["global_symbol"])
-    const_dict = primfunc.attrs["ethos-u.constants"]
     tir_mod = tvm.IRModule()
     tir_mod[symbol] = primfunc
 
-    const_dict_np = dict()
-    for buffer_var in const_dict.keys():
-        const_dict_np[buffer_var] = const_dict[buffer_var].numpy()
+    const_dict_np = {
+        buffer_var: ndarray.numpy() for buffer_var, ndarray in collect_consts(tir_mod).items()
+    }
 
     cmms, encoded_constants, base_addresses = tir_to_cs_translator.translate(tir_mod, const_dict_np)
     return util.CompilationArtifact(symbol, cmms, encoded_constants, base_addresses)
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
index d47b3d4a7de6..00d5692d588c 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -49,9 +49,6 @@ def lower_ethosu(sch, args, const_dict, name="main"):
     -------
     mod : tvm.IRModule
         The lowered TIR module.
-    const_dict : dict of int to numpy.ndarray
-        The modified constant dictionary.
-
     """
     if not isinstance(args, list):
         args = list(args.inputs) + list(args.outputs)
@@ -101,8 +98,8 @@ def lower_ethosu(sch, args, const_dict, name="main"):
 
         mod = tvm.tir.transform.RemoveNoOp()(mod)
         mod = ethosu_passes.AnnotateAllocates()(mod)
-        mod, const_dict = ethosu_passes.CreatePrimFuncWithoutConstants(const_dict)(mod)
-    return mod, const_dict
+        mod = ethosu_passes.CreatePrimFuncWithoutConstants(const_dict)(mod)
+    return mod
 
 
 def lower_to_te(prim_func):
@@ -200,15 +197,11 @@ def __init__(self, scheduler):
     def transform_npu_function(self, _, func: relay.Function) -> relay.Function:
         """Lower NPU functions to TIR."""
 
-        tir_mod, const_dict = _lower_to_tir(func, self.scheduler)
-
-        for param in const_dict.keys():
-            const_dict[param] = tvm.nd.array(const_dict[param])
+        tir_mod = _lower_to_tir(func, self.scheduler)
 
         compiler_name = "ethos-u"
         primfunc = tir_mod["main"]
         primfunc = primfunc.with_attr("global_symbol", func.attrs["global_symbol"])
-        primfunc = primfunc.with_attr("ethos-u.constants", const_dict)
         primfunc = primfunc.with_attr("target", tvm.target.Target(compiler_name))
         return primfunc
 
@@ -233,14 +226,11 @@ def _lower_to_tir(func, cascader=None):
     -------
     mod : tvm.IRModule
         The lowered TIR module.
-    consts : dict of int to numpy.ndarray
-        A dict of the extracted constants keyed by their param index.
-
     """
     func, consts = extract_constants(func)
     mod = tvm.IRModule.from_expr(func)
     func = relay.transform.InferType()(mod)["main"]
     cached_func = lower_to_te(func)
     s = schedule(cached_func, consts, cascader)
-    mod, consts = lower_ethosu(s, cached_func, consts)
-    return mod, consts
+    mod = lower_ethosu(s, cached_func, consts)
+    return mod
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
index 9636f2044733..007b55a812af 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -911,8 +911,7 @@ def _ftransform(f, mod, ctx):
 def CreatePrimFuncWithoutConstants(const_dict):
     """
     This pass will remove arguments that are constants
-    from PrimFunc Args. These should be replaced properly
-    with tir.allocate_const when it becomes available.
+    from PrimFunc Args, replacing them with tir.allocate_const.
 
     It also modifies the constant dictionary to
     rewrite the keys as the actual tir.Vars that are params
@@ -920,22 +919,27 @@ def CreatePrimFuncWithoutConstants(const_dict):
     arguments that represent constants.
     """
 
-    new_const_dict = dict()
-
     def _ftransform(f, mod, ctx):
         new_params = list()
         new_buffer_map = dict()
-        for param_idx in const_dict.keys():
-            # We are using buffer_var to key the constants as
-            # PrimFunc params of constants will be removed.
-            new_const_dict[f.buffer_map[f.params[param_idx]].data] = const_dict[param_idx]
+
+        new_body = f.body
+
         for i, param in enumerate(f.params):
-            if i not in const_dict.keys():
+            if i in const_dict:
+                const_np = const_dict[i]
+                const_ndarray = tvm.nd.array(const_np, device=tvm.cpu())
+                buf = f.buffer_map[param]
+                new_body = tvm.tir.AllocateConst(
+                    buf.data, buf.dtype, buf.shape, const_ndarray, new_body
+                )
+            else:
                 new_params.append(param)
                 new_buffer_map[param] = f.buffer_map[param]
+
         return tvm.tir.PrimFunc(
             new_params,
-            f.body,
+            new_body,
             f.ret_type,
             new_buffer_map,
             f.attrs,
@@ -947,7 +951,7 @@ def _create_primfunc_without_constants(mod):
             _ftransform, opt_level=0, name="tir.contrib.ethos-u.CreatePrimFuncWithoutConstants"
         )
         mod = transform_func(mod)
-        return mod, new_const_dict
+        return mod
 
     return _create_primfunc_without_constants
 
diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py
index 386ef9038e49..6e9c610de5d4 100644
--- a/python/tvm/relay/op/contrib/ethosu.py
+++ b/python/tvm/relay/op/contrib/ethosu.py
@@ -2347,14 +2347,26 @@ def partition_for_ethosu(
         mod["main"] = bind_params_by_name(mod["main"], params)
 
     pattern = relay.op.contrib.get_pattern_table("ethos-u")
-    mod = relay.transform.InferType()(mod)
-    mod = codegen.replicate_pads(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.transform.MergeComposite(pattern)(mod)
-    mod = relay.transform.AnnotateTarget("ethos-u")(mod)
-    mod = relay.transform.MergeCompilerRegions()(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = relay.transform.PartitionGraph(mod_name)(mod)
-    mod = relay.transform.InferType()(mod)
-    mod = preprocess.preprocess_ext_io()(mod)
+
+    seq = tvm.ir.transform.Sequential(
+        [
+            relay.transform.InferType(),
+            tvm.ir.transform.module_pass(
+                lambda mod, context: codegen.replicate_pads(mod),
+                opt_level=0,
+                name="ethosu.replicate_pads",
+            ),
+            relay.transform.InferType(),
+            relay.transform.MergeComposite(pattern),
+            relay.transform.AnnotateTarget("ethos-u"),
+            relay.transform.MergeCompilerRegions(),
+            relay.transform.InferType(),
+            relay.transform.PartitionGraph(mod_name),
+            relay.transform.InferType(),
+            preprocess.preprocess_ext_io(),
+        ],
+        name="partition_for_ethosu",
+    )
+    mod = seq(mod)
+
     return mod
diff --git a/src/relay/backend/contrib/ethosu/codegen.cc b/src/relay/backend/contrib/ethosu/codegen.cc
index 54d0595c4634..a51c8ade24a7 100644
--- a/src/relay/backend/contrib/ethosu/codegen.cc
+++ b/src/relay/backend/contrib/ethosu/codegen.cc
@@ -307,9 +307,6 @@ runtime::Module TIRToRuntime(IRModule mod, Target target) {
   Array<CompilationArtifact> compile_artifacts;
   for (const auto& kv : mod->functions) {
     const tir::PrimFunc& prim_func = Downcast<tir::PrimFunc>(kv.second);
-    Optional<Map<Integer, runtime::NDArray>> params =
-        prim_func->GetAttr<Map<Integer, runtime::NDArray>>("ethos-u.constants");
-    ICHECK(params) << "microNPU params should be present";
     auto primfunc_to_artifact_pf =
         tvm::runtime::Registry::Get("relay.ext.ethos-u.primfunc_to_artifact");
     ICHECK(primfunc_to_artifact_pf);
diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc
index 1d1e674a9dd1..5302bf632ab2 100644
--- a/src/tir/ir/stmt.cc
+++ b/src/tir/ir/stmt.cc
@@ -287,6 +287,7 @@ AllocateConst::AllocateConst(Var buffer_var, DataType dtype, Array<PrimExpr> ext
   }
   ICHECK(body.defined());
   ICHECK(data_or_idx.defined());
+  ICHECK(annotations.defined());
 
   ObjectPtr<AllocateConstNode> node = make_object<AllocateConstNode>();
   node->buffer_var = std::move(buffer_var);
@@ -323,9 +324,10 @@ int64_t AllocateConstNode::ConstantAllocationSize(const Array<PrimExpr>& extents
 }
 TVM_REGISTER_GLOBAL("tir.AllocateConst")
     .set_body_typed([](Var buffer_var, DataType dtype, Array<PrimExpr> extents,
-                       ObjectRef data_or_idx, Stmt body, Map<String, ObjectRef> annotations,
-                       Span span) {
-      return AllocateConst(buffer_var, dtype, extents, data_or_idx, body, annotations, span);
+                       ObjectRef data_or_idx, Stmt body,
+                       Optional<Map<String, ObjectRef>> annotations, Span span) {
+      return AllocateConst(buffer_var, dtype, extents, data_or_idx, body,
+                           annotations.value_or(Map<String, ObjectRef>()), span);
     });
 
 TVM_REGISTER_NODE_TYPE(AllocateConstNode);
diff --git a/tests/python/contrib/test_ethosu/cascader/test_integration.py b/tests/python/contrib/test_ethosu/cascader/test_integration.py
index 14cc8fbc61cf..29b994b0acbd 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_integration.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_integration.py
@@ -74,7 +74,9 @@ def _compile_model(relay_function):
     mod = tvm.IRModule()
     mod["main"] = relay_function
     mod = relay.transform.InferType()(mod)
-    tir_mod = _lower_to_tir(mod["main"], _ethos_u55_cascader())[0]
+    func = mod["main"]
+    cascader = _ethos_u55_cascader()
+    tir_mod = _lower_to_tir(func, cascader)
     return tir_mod["main"]
 
 
@@ -109,7 +111,7 @@ def test_single_conv_compute_cycles_hint():
     for single convolution.
     """
     primfunc = _compile_model(_create_single_conv2d())
-    ops = primfunc.body.body.seq
+    ops = primfunc.body.body.body.seq
     compute_cycles_hints = [2944, 320]
     for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
         assert op.attr_key == "pragma_compute_cycles_hint"
@@ -135,7 +137,7 @@ def test_scalar_add_compute_cycles_hint():
     for add with scalar values.
     """
     primfunc = _compile_model(_create_scalar_add())
-    ops = primfunc.body.body.seq
+    ops = primfunc.body.body.body.seq
 
     compute_cycles_hints = [16, 24]
     for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 71e7e029c148..a7de2a6eab28 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -789,3 +789,62 @@ def make_ethosu_unary_elementwise(
         ofm_layout=ofm_layout,
     )
     return ethosu_unary_elementwise
+
+
+def copy_allocate_const_data(test_mod: tvm.IRModule, reference_mod: tvm.IRModule) -> tvm.IRModule:
+    """For testing purposes, copy the NDArray into refernece
+
+    NDArray does not implement SEqual, so StructuralEqual defaults to
+    pointer equality.  Since the reference module and the test module
+    were generated separately, they won't have the same NDArray.
+    Therefore, copy it over before StructuralEqual.
+    """
+
+    def collect_ndarray(func):
+        output = []
+
+        def fvisit(node):
+            if isinstance(node, tvm.tir.AllocateConst):
+                output.append(node.data)
+
+        tvm.tir.stmt_functor.post_order_visit(func.body, fvisit)
+
+        return output
+
+    def inject_ndarray(func, data_arrays):
+        def fvisit(node):
+            if data_arrays and isinstance(node, tvm.tir.AllocateConst):
+                data = data_arrays.pop(0)
+                return tvm.tir.AllocateConst(
+                    buffer_var=node.buffer_var,
+                    dtype=node.dtype,
+                    extents=node.extents,
+                    data_or_idx=data,
+                    body=node.body,
+                    annotations=node.annotations,
+                    span=node.span,
+                )
+            else:
+                return node
+
+        body = tvm.tir.stmt_functor.ir_transform(func.body, lambda node: None, fvisit)
+        if body.same_as(func.body):
+            return func
+        else:
+            return tvm.tir.PrimFunc(
+                func.params, body, func.ret_type, func.buffer_map, func.attrs, func.span
+            )
+
+    data_arrays = {
+        gvar.name_hint: collect_ndarray(func)
+        for gvar, func in test_mod.functions.items()
+        if isinstance(func, tvm.tir.PrimFunc)
+    }
+
+    new_module = {}
+    for gvar, func in reference_mod.functions.items():
+        if isinstance(func, tvm.tir.PrimFunc):
+            if gvar.name_hint in data_arrays:
+                func = inject_ndarray(func, data_arrays[gvar.name_hint])
+        new_module[gvar] = func
+    return tvm.IRModule(new_module)
diff --git a/tests/python/contrib/test_ethosu/test_compiler.py b/tests/python/contrib/test_ethosu/test_compiler.py
index 3bf7abb8f113..b6a04d4dd54a 100644
--- a/tests/python/contrib/test_ethosu/test_compiler.py
+++ b/tests/python/contrib/test_ethosu/test_compiler.py
@@ -57,7 +57,7 @@ def test_lower_to_tir_arg_count(relay_function, arg_count):
     mod = tvm.IRModule()
     mod["main"] = relay_function()
     mod = relay.transform.InferType()(mod)
-    tir_mod = _lower_to_tir(mod["main"])[0]
+    tir_mod = _lower_to_tir(mod["main"])
     primfunc = tir_mod["main"]
     assert len(primfunc.params) == arg_count
 
diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py
index 4341f367f0e1..3c9be7c575da 100644
--- a/tests/python/contrib/test_ethosu/test_encode_constants.py
+++ b/tests/python/contrib/test_ethosu/test_encode_constants.py
@@ -21,6 +21,7 @@
 import tvm
 from tvm import relay
 from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator
+from tvm.relay.backend.contrib.ethosu.codegen import collect_consts
 from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir
 from tvm.relay.backend.contrib.ethosu.tir.scheduler import (
     OperatorCompute,
@@ -29,8 +30,7 @@
 from tvm.relay.testing import run_opt_pass
 from tvm.script import tir as T
 
-from .infra import make_ethosu_binary_elementwise, make_ethosu_conv2d
-
+from .infra import make_ethosu_binary_elementwise, make_ethosu_conv2d, copy_allocate_const_data
 
 # fmt: off
 @tvm.script.ir_module
@@ -41,11 +41,16 @@ def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_writ
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         placeholder = T.Buffer([8192], "int8", data=input_placeholder.data)
         ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data)
-        buffer1 = T.Buffer([160], "uint8")
-        buffer3 = T.Buffer([144], "uint8")
-        buffer5 = T.Buffer([144], "uint8")
-        buffer7 = T.Buffer([144], "uint8")
-        buffer8 = T.Buffer([32], "uint8")
+        data7 = T.allocate_const([0]*144,'uint8',[144])
+        buffer7 = T.Buffer([144], "uint8", data=data7)
+        data5 = T.allocate_const([0]*144,'uint8',[144])
+        buffer5 = T.Buffer([144], "uint8", data=data5)
+        data3 = T.allocate_const([0]*144,'uint8',[144])
+        buffer3 = T.Buffer([144], "uint8", data=data3)
+        data1 = T.allocate_const([0]*160,'uint8',[160])
+        buffer1 = T.Buffer([160], "uint8", data=data1)
+        # data8 = T.allocate_const([0]*32,'uint8',[32])
+        # buffer8 = T.Buffer([32], "uint8" , data=data8)
         # body
         p1_data = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.Buffer([160], "uint8", data=p1_data)
@@ -68,22 +73,32 @@ class WeightStreamOnlyU65:
     @T.prim_func
     def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
         T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
+
+        data_encoded_3 = T.allocate_const([0]*192, 'uint8',[192])
+        buffer_encoded_3 = T.Buffer((192,), "uint8", data=data_encoded_3)
+        data_encoded_2 = T.allocate_const([0]*208, 'uint8',[208])
+        buffer_encoded_2 = T.Buffer((208,), "uint8",data=data_encoded_2)
+        data_encoded_1 = T.allocate_const([0]*192, 'uint8',[192])
+        buffer_encoded_1 = T.Buffer((192,), "uint8",data=data_encoded_1)
+        data_encoded = T.allocate_const([0]*192, 'uint8',[192])
+        buffer_encoded = T.Buffer((192,), "uint8", data=data_encoded)
+
         p2_global_6 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
         p2_global_4 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
         p2_global_5 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
-        buffer_encoded = T.Buffer((192,), "uint8")
+
         p2_global_3 = T.Buffer((192,), "uint8", data=p2_global_6)
         T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 192, p2_global_3[0])
-        buffer_encoded_1 = T.Buffer((192,), "uint8")
+
         p2_global_4_1 = T.Buffer((192,), "uint8", data=p2_global_4)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 192, p2_global_4_1[0])
-        buffer_encoded_2 = T.Buffer((208,), "uint8")
+
         p2_global_5_1 = T.Buffer((208,), "uint8", data=p2_global_5)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 208, p2_global_5_1[0])
         ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
         ethosu_write_1 = T.Buffer((2048,), "int8", data=ethosu_write.data)
         T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_3[0], 80, p2_global_3[80], 80, 12, p2_global_3[160], 16, p2_global_3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
-        buffer_encoded_3 = T.Buffer((192,), "uint8")
+
         p2_global_6_1 = T.Buffer((192,), "uint8", data=p2_global_6)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 192, p2_global_6_1[0])
         T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_1[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2_global_4_1[0], 80, p2_global_4_1[80], 80, 12, p2_global_4_1[160], 16, p2_global_4_1[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
@@ -140,12 +155,13 @@ def _get_func():
     }
     with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
         func = _get_func()
-        mod, consts = _lower_to_tir(func, cascader=_planner)
+        mod = _lower_to_tir(func, cascader=_planner)
         script = mod.script()
         test_mod = tvm.script.from_source(script)
+        reference_mod = copy_allocate_const_data(test_mod, reference_mod)
         tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
-        test_const_size = [value.size for value in list(consts.values())]
+        test_const_size = [value.shape[0] for value in collect_consts(test_mod).values()]
         assert reference_const_sizes.sort() == test_const_size.sort()
 
 
@@ -156,7 +172,10 @@ class RereadWeightsU55:
     def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer1 = T.Buffer([384], "uint8")
+
+        data1 = T.allocate_const([0]*384, 'uint8',[384])
+        buffer1 = T.Buffer([384], "uint8",data=data1)
+
         placeholder = T.Buffer([8192], "int8", data=input_placeholder.data)
         ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data)
         # body
@@ -177,10 +196,13 @@ class RereadWeightsU65:
     def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        # buffer definition
+
+        data_encoded_1 = T.allocate_const([0]*464, 'uint8',[464])
+        placeholder_encoded_1 = T.Buffer([464], "uint8", data=data_encoded_1)
+
         placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data)
         ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
-        placeholder_encoded_1 = T.Buffer([464], "uint8")
+
         # body
         p1_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.Buffer([464], "uint8", data=p1_data)
@@ -242,12 +264,13 @@ def _get_func():
     }
     with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
         func = _get_func()
-        mod, consts = _lower_to_tir(func, cascader=_cascader)
+        mod = _lower_to_tir(func, cascader=_cascader)
         script = mod.script()
         test_mod = tvm.script.from_source(script)
+        reference_mod = copy_allocate_const_data(test_mod, reference_mod)
         tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
-        test_const_size = [value.size for value in list(consts.values())]
+        test_const_size = [value.shape[0] for value in collect_consts(test_mod).values()]
         assert reference_const_sizes.sort() == test_const_size.sort()
 
 
@@ -258,10 +281,14 @@ class DirectReadOnlyU55:
     def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write: T.Buffer((1, 16, 16, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([592], "uint8")
-        buffer_1 = T.Buffer([160], "uint8")
-        buffer_2 = T.Buffer([160], "uint8")
-        buffer_3 = T.Buffer([80], "uint8")
+        data_3 = T.allocate_const([0]*80, 'uint8', [80])
+        buffer_3 = T.Buffer([80], "uint8",data=data_3)
+        data_2 = T.allocate_const([0]*160, 'uint8', [160])
+        buffer_2 = T.Buffer([160], "uint8",data=data_2)
+        data_1 = T.allocate_const([0]*160, 'uint8', [160])
+        buffer_1 = T.Buffer([160], "uint8",data=data_1)
+        data = T.allocate_const([0]*592, 'uint8', [592])
+        buffer = T.Buffer([592], "uint8",data=data)
         placeholder = T.Buffer([8192], "int8", data=input_placeholder.data)
         ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data)
         # body
@@ -279,10 +306,16 @@ def main(input_placeholder: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_writ
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
-        placeholder_encoded = T.Buffer([608], dtype="uint8")
-        placeholder_encoded_1 = T.Buffer([160], dtype="uint8")
-        placeholder_encoded_2 = T.Buffer([208], dtype="uint8")
-        placeholder_encoded_3 = T.Buffer([96], dtype="uint8")
+
+        data_3 = T.allocate_const([0]*96, 'uint8',[96])
+        placeholder_encoded_3 = T.Buffer([96], dtype="uint8" , data=data_3)
+        data_2 = T.allocate_const([0]*208, 'uint8',[208])
+        placeholder_encoded_2 = T.Buffer([208], dtype="uint8", data=data_2)
+        data_1 = T.allocate_const([0]*160, 'uint8',[160])
+        placeholder_encoded_1 = T.Buffer([160], dtype="uint8", data=data_1)
+        data = T.allocate_const([0]*608, 'uint8',[608])
+        placeholder_encoded = T.Buffer([608], dtype="uint8"  , data=data)
+
         placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data)
         ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data)
         # body
@@ -339,13 +372,14 @@ def _get_func():
     }
     with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
         func = _get_func()
-        mod, consts = _lower_to_tir(func)
+        mod = _lower_to_tir(func)
 
         script = mod.script()
         test_mod = tvm.script.from_source(script)
+        reference_mod = copy_allocate_const_data(test_mod, reference_mod)
         tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
-        test_const_size = [value.size for value in list(consts.values())]
+        test_const_size = [value.shape[0] for value in collect_consts(test_mod).values()]
         assert reference_const_sizes.sort() == test_const_size.sort()
 
 
@@ -356,12 +390,20 @@ class MixedReadU55:
     def main(input_ifm: T.Buffer((1,16,16,32), "int8"), input_ethosu_write: T.Buffer((1,16,16,8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer1 = T.Buffer([112], "uint8")
-        buffer3 = T.Buffer([112], "uint8")
-        buffer5 = T.Buffer([112], "uint8")
-        buffer7 = T.Buffer([112], "uint8")
-        buffer9 = T.Buffer([592], "uint8")
-        buffer10 = T.Buffer([160], "uint8")
+
+        data7 = T.allocate_const([0]*112, 'uint8', [112])
+        buffer7 = T.Buffer([112], "uint8" , data=data7)
+        data5 = T.allocate_const([0]*112, 'uint8', [112])
+        buffer5 = T.Buffer([112], "uint8" , data=data5)
+        data3 = T.allocate_const([0]*112, 'uint8', [112])
+        buffer3 = T.Buffer([112], "uint8" , data=data3)
+        data1 = T.allocate_const([0]*112, 'uint8', [112])
+        buffer1 = T.Buffer([112], "uint8" , data=data1)
+        data10 = T.allocate_const([0]*160, 'uint8', [160])
+        buffer10 = T.Buffer([160], "uint8", data=data10)
+        data9 = T.allocate_const([0]*592, 'uint8', [592])
+        buffer9 = T.Buffer([592], "uint8" , data=data9)
+
         ifm = T.Buffer([8192], "int8", data=input_ifm.data)
         ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data)
         # body
@@ -388,27 +430,36 @@ class MixedReadU65:
     @T.prim_func
     def main(ifm: T.Buffer((1, 16, 16, 32), "int8"), ethosu_write: T.Buffer((1, 16, 16, 8), "int8")):
         T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
+
+        data_encoded_3 = T.allocate_const([0]*128, 'uint8',[128])
+        buffer_encoded_3 = T.Buffer((128,), "uint8", data=data_encoded_3)
+        data_encoded_2 = T.allocate_const([0]*128, 'uint8',[128])
+        buffer_encoded_2 = T.Buffer((128,), "uint8", data=data_encoded_2)
+        data_encoded_1 = T.allocate_const([0]*128, 'uint8',[128])
+        buffer_encoded_1 = T.Buffer((128,), "uint8", data=data_encoded_1)
+        data_encoded = T.allocate_const([0]*128, 'uint8',[128])
+        buffer_encoded = T.Buffer((128,), "uint8", data=data_encoded)
+        p2_encoded_data = T.allocate_const([0]*160, 'uint8',[160])
+        p2_encoded = T.Buffer((160,), "uint8", data=p2_encoded_data)
+        p1_encoded_data = T.allocate_const([0]*608, 'uint8',[608])
+        p1_encoded = T.Buffer((608,), "uint8", data=p1_encoded_data)
+
+
         p5_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
         p5_global_1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
         ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin": T.bool(True)})
         p5_global_2 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
-        buffer_encoded = T.Buffer((128,), "uint8")
         p5_global_3 = T.Buffer((128,), "uint8", data=p5_global)
         T.call_extern("handle", "ethosu_copy", buffer_encoded[0], 128, p5_global_3[0])
-        buffer_encoded_1 = T.Buffer((128,), "uint8")
         p5_global_4 = T.Buffer((128,), "uint8", data=p5_global_1)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_1[0], 128, p5_global_4[0])
         ifm_1 = T.Buffer((8192,), "int8", data=ifm.data)
         ethosu_write_2 = T.Buffer((4096,), "int8", data=ethosu_write_1)
-        p1_encoded = T.Buffer((608,), "uint8")
-        p2_encoded = T.Buffer((160,), "uint8")
         T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, p1_encoded[0], 304, p1_encoded[304], 304, 12, p2_encoded[0], 80, p2_encoded[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
-        buffer_encoded_2 = T.Buffer((128,), "uint8")
         p5_global_5 = T.Buffer((128,), "uint8", data=p5_global_2)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_2[0], 128, p5_global_5[0])
         ethosu_write_3 = T.Buffer((2048,), "int8", data=ethosu_write.data)
         T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_3[0], 48, p5_global_3[48], 48, 12, p5_global_3[96], 16, p5_global_3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
-        buffer_encoded_3 = T.Buffer((128,), "uint8")
         p5_global_6 = T.Buffer((128,), "uint8", data=p5_global)
         T.call_extern("handle", "ethosu_copy", buffer_encoded_3[0], 128, p5_global_6[0])
         T.call_extern("handle", "ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write_3[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5_global_4[0], 48, p5_global_4[48], 48, 12, p5_global_4[96], 16, p5_global_4[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0)
@@ -474,13 +525,14 @@ def _get_func():
     }
     with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
         func = _get_func()
-        mod, consts = _lower_to_tir(func, cascader=_planner)
+        mod = _lower_to_tir(func, cascader=_planner)
 
         script = mod.script()
         test_mod = tvm.script.from_source(script)
+        reference_mod = copy_allocate_const_data(test_mod, reference_mod)
         tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
-        test_const_size = [value.size for value in list(consts.values())]
+        test_const_size = [value.shape[0] for value in collect_consts(test_mod).values()]
         assert reference_const_sizes.sort() == test_const_size.sort()
 
 
@@ -508,11 +560,14 @@ def get_graph():
         func = run_opt_pass(func, relay.transform.InferType())
         return func
 
-    tir_mod, params = _lower_to_tir(get_graph(), copy_constants())
+    tir_mod = _lower_to_tir(get_graph(), copy_constants())
 
     # Check tile address for the scalar constant input hasn't been
     # overwritten.
-    extern_calls = tir_mod["main"].body.body.body.body
+
+    extern_calls = tir_mod["main"].body
+    while isinstance(extern_calls, (tvm.tir.Allocate, tvm.tir.AllocateConst)):
+        extern_calls = extern_calls.body
     binary_elementwise = extern_calls[-1].value
     args = binary_elementwise.args
 
@@ -529,6 +584,10 @@ def get_graph():
         "target", tvm.target.Target("ethos-u", host="ethos-u")
     )
     tir_mod = tvm.tir.transform.MakeUnpackedAPI()(tir_mod)
+    params = {
+        buffer_var: ndarray.numpy() for buffer_var, ndarray in collect_consts(tir_mod).items()
+    }
+
     tir_to_cs_translator.translate(tir_mod, params)
 
 
diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
index ef034930d7bc..393c1e519d21 100644
--- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py
+++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py
@@ -24,7 +24,7 @@
 from tvm.relay.testing import run_opt_pass
 from tvm.script import tir as T
 
-from .infra import make_ethosu_conv2d
+from .infra import make_ethosu_conv2d, copy_allocate_const_data
 
 
 # fmt: off
@@ -35,18 +35,29 @@ def main(input_placeholder: T.Buffer((1,8,12,16), "int8"), input_placeholder_1:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
 
+        data_3 = T.allocate_const([0]*160, 'uint8', [160])
+        data_2 = T.allocate_const([0]*2992, 'uint8', [2992])
+        data_7 = T.allocate_const([0]*160, 'uint8', [160])
+        data_6 = T.allocate_const([0]*2992, 'uint8', [2992])
+        data_1 = T.allocate_const([0]*160, 'uint8', [160])
+        data = T.allocate_const([0]*2992, 'uint8', [2992])
+        data_5 = T.allocate_const([0]*160, 'uint8', [160])
+        data_4 = T.allocate_const([0]*2992, 'uint8', [2992])
+
+        buffer = T.Buffer([2992], "uint8", data=data)
+        buffer_1 = T.Buffer([160], "uint8", data=data_1)
+        buffer_2 = T.Buffer([2992], "uint8", data=data_2)
+        buffer_3 = T.Buffer([160], "uint8", data=data_3)
+        buffer_4 = T.Buffer([2992], "uint8", data=data_4)
+        buffer_5 = T.Buffer([160], "uint8", data=data_5)
+        buffer_6 = T.Buffer([2992], "uint8", data=data_6)
+        buffer_7 = T.Buffer([160], "uint8", data=data_7)
+
         placeholder = T.Buffer(1536, dtype="int8", data=input_placeholder.data)
         placeholder_1 = T.Buffer(1280, dtype="int8", data=input_placeholder_1.data)
         T_concat = T.Buffer(4096, dtype="int8", data=input_T_concat.data)
 
-        buffer = T.Buffer([2992], "uint8")
-        buffer_1 = T.Buffer([160], "uint8")
-        buffer_2 = T.Buffer([2992], "uint8")
-        buffer_3 = T.Buffer([160], "uint8")
-        buffer_4 = T.Buffer([2992], "uint8")
-        buffer_5 = T.Buffer([160], "uint8")
-        buffer_6 = T.Buffer([2992], "uint8")
-        buffer_7 = T.Buffer([160], "uint8")
+
         # body
         T_concat_1_data = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True})
         T_concat_1 = T.Buffer([2816], "int8", data=T_concat_1_data)
@@ -73,11 +84,12 @@ def _get_func():
         return func
 
     func = _get_func()
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     script = mod.script()
     test_mod = tvm.script.from_source(script)
 
     reference_mod = ReferenceModule
+    reference_mod = copy_allocate_const_data(test_mod, reference_mod)
     tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
 
diff --git a/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py b/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py
index dd388109466f..d3ea098e9082 100644
--- a/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py
+++ b/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py
@@ -71,7 +71,7 @@ def test_binary_elementwise_single(
     )
     func = relay.Function(relay.analysis.free_vars(binary_elementwise), binary_elementwise)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
@@ -229,7 +229,7 @@ def test_shift_binary_elementwise_single(
     )
     func = relay.Function(relay.analysis.free_vars(binary_elementwise), binary_elementwise)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 32d1303e124e..3293f98b317e 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -24,7 +24,7 @@
 from tvm.relay.testing import run_opt_pass
 from tvm.script import tir as T
 
-from .infra import make_ethosu_conv2d
+from .infra import make_ethosu_conv2d, copy_allocate_const_data
 
 
 def _create_serial_conv2d_params(
@@ -350,7 +350,7 @@ def _get_func(
         [(1, 2, 12, 9, 16), 182, 67, (1, 3), (6, 3), (2, 2), (1, 1), "CLIP", "NHCWB16", "NHCWB16"],
     ]
     func = _get_func(*trial)
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
@@ -370,10 +370,14 @@ class Conv2dDoubleCascade1:
     def main(input_placeholder_5: T.Buffer((1, 8, 8, 3), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 8, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([304], "uint8")
-        buffer_1 = T.Buffer([80], "uint8")
-        buffer_2 = T.Buffer([320], "uint8")
-        buffer_3 = T.Buffer([160], "uint8")
+        data_1 = T.allocate_const([0]*80, "uint8", [80])
+        buffer_1 = T.Buffer([80], "uint8", data=data_1)
+        data = T.allocate_const([0]*304, "uint8", [304])
+        buffer = T.Buffer([304], "uint8", data=data)
+        data_2 = T.allocate_const([0]*320, "uint8", [320])
+        buffer_2 = T.Buffer([320], "uint8", data=data_2)
+        data_3 = T.allocate_const([0]*160, "uint8", [160])
+        buffer_3 = T.Buffer([160], "uint8", data=data_3)
         placeholder_5 = T.Buffer([192], 'int8', data=input_placeholder_5.data)
         ethosu_write_1 = T.Buffer([512], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -392,10 +396,14 @@ class Conv2dDoubleCascade2:
     def main(input_placeholder_5: T.Buffer((1, 8, 8, 3), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 8, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([80], "uint8")
-        buffer_1 = T.Buffer([320], "uint8")
-        buffer_2 = T.Buffer([1312], "uint8")
-        buffer_3 = T.Buffer([2608], "uint8")
+        data = T.allocate_const([0]*80, "uint8", [80])
+        buffer = T.Buffer([80], "uint8",data=data)
+        data_3 = T.allocate_const([0]*2608, "uint8", [2608])
+        buffer_3 = T.Buffer([2608], "uint8",data=data_3)
+        data_1 = T.allocate_const([0]*320, "uint8", [320])
+        buffer_1 = T.Buffer([320], "uint8", data=data_1)
+        data_2 = T.allocate_const([0]*1312, "uint8", [1312])
+        buffer_2 = T.Buffer([1312], "uint8", data=data_2)
         placeholder_5 = T.Buffer([192], 'int8', data=input_placeholder_5.data)
         ethosu_write_1 = T.Buffer([512], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -414,10 +422,14 @@ class Conv2dDoubleCascade3:
     def main(input_placeholder_5: T.Buffer((1, 16, 16, 3), "int8"), input_ethosu_write_1: T.Buffer((1, 20, 4, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([1744], "uint8")
-        buffer_1 = T.Buffer([80], "uint8")
-        buffer_2 = T.Buffer([320], "uint8")
-        buffer_3 = T.Buffer([880], "uint8")
+        data_1 = T.allocate_const([0]*80, "uint8", [80])
+        buffer_1 = T.Buffer([80], "uint8", data=data_1)
+        data = T.allocate_const([0]*1744, "uint8", [1744])
+        buffer = T.Buffer([1744], "uint8",data=data)
+        data_2 = T.allocate_const([0]*320, "uint8", [320])
+        buffer_2 = T.Buffer([320], "uint8", data=data_2)
+        data_3 = T.allocate_const([0]*880, "uint8", [880])
+        buffer_3 = T.Buffer([880], "uint8",data=data_3)
         placeholder_5 = T.Buffer([768], 'int8', data=input_placeholder_5.data)
         ethosu_write_1 = T.Buffer([640], 'int8', data=input_ethosu_write_1.data)
 
@@ -439,10 +451,14 @@ class Conv2dDoubleCascade4:
     def main(input_placeholder_5: T.Buffer((1, 8, 1, 8, 16), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 2, 8, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([1456], "uint8")
-        buffer_1 = T.Buffer([352], "uint8")
-        buffer_2 = T.Buffer([272], "uint8")
-        buffer_3 = T.Buffer([11040], "uint8")
+        data_2 = T.allocate_const([0]*272, "uint8", [272])
+        buffer_2 = T.Buffer([272], "uint8"  ,data=data_2)
+        data_3 = T.allocate_const([0]*11040, "uint8", [11040])
+        buffer_3 = T.Buffer([11040], "uint8",data=data_3)
+        data_1 = T.allocate_const([0]*352, "uint8", [352])
+        buffer_1 = T.Buffer([352], "uint8"  ,data=data_1)
+        data = T.allocate_const([0]*1456, "uint8", [1456])
+        buffer = T.Buffer([1456], "uint8",data=data)
         placeholder_5 = T.Buffer([1024], 'int8', data=input_placeholder_5.data)
         ethosu_write_1 = T.Buffer([2048], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -461,10 +477,14 @@ class Conv2dDoubleCascade5:
     def main(input_placeholder: T.Buffer((1, 8, 8, 3), "int8"), input_ethosu_write: T.Buffer((1, 32, 32, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([320], "uint8")
-        buffer_2 = T.Buffer([304], "uint8")
-        buffer_3 = T.Buffer([80], "uint8")
+        data_3 = T.allocate_const([0]*80, "uint8", [80])
+        buffer_3 = T.Buffer([80], "uint8",data=data_3)
+        data_2 = T.allocate_const([0]*304, "uint8", [304])
+        buffer_2 = T.Buffer([304], "uint8",data=data_2)
+        data_1 = T.allocate_const([0]*320, "uint8", [320])
+        buffer_1 = T.Buffer([320], "uint8",data=data_1)
+        data = T.allocate_const([0]*160, "uint8", [160])
+        buffer = T.Buffer([160], "uint8",data=data)
         placeholder = T.Buffer([192], 'int8', data=input_placeholder.data)
         ethosu_write = T.Buffer([8192], 'int8', data=input_ethosu_write.data)
         # body
@@ -483,10 +503,14 @@ class Conv2dDoubleCascade6:
     def main(input_placeholder: T.Buffer((1, 8, 1, 8, 16), "int8"), input_ethosu_write: T.Buffer((1, 32, 2, 32, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([1456], "uint8")
-        buffer_1 = T.Buffer([352], "uint8")
-        buffer_2 = T.Buffer([11040], "uint8")
-        buffer_3 = T.Buffer([272], "uint8")
+        data_3 = T.allocate_const([0]*272, "uint8", [272])
+        buffer_3 = T.Buffer([272], "uint8",data=data_3)
+        data_2 = T.allocate_const([0]*11040, "uint8", [11040])
+        buffer_2 = T.Buffer([11040], "uint8",data=data_2)
+        data_1 = T.allocate_const([0]*352, "uint8", [352])
+        buffer_1 = T.Buffer([352], "uint8",data=data_1)
+        data = T.allocate_const([0]*1456, "uint8", [1456])
+        buffer = T.Buffer([1456], "uint8",data=data)
         placeholder = T.Buffer([1024], 'int8', data=input_placeholder.data)
         ethosu_write = T.Buffer([32768], 'int8', data=input_ethosu_write.data)
         # body
@@ -638,9 +662,10 @@ def _get_func(
     }
     with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-u.options": config}):
         func = _get_func(*params[:-1])
-        mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
+        mod = _lower_to_tir(func, cascader=total_cascader(params[-1]))
         script = mod.script()
         mod = tvm.script.from_source(script)
+        reference_mod = copy_allocate_const_data(mod, reference_mod)
         tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
 
 
@@ -697,7 +722,7 @@ def _get_func(ifm_shape, lower, upper, ofm_channels=16):
     reference_mod = trial[0]
     params = trial[1:]
     func = _get_func(*params)
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     script = mod.script()
     mod = tvm.script.from_source(script)
     tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
@@ -799,7 +824,7 @@ def _get_func(ifm_shape, reshaped, ifm_layout):
     reference_mod = trial[0]
     params = trial[1:]
     func = _get_func(*params)
-    mod, _ = _lower_to_tir(func, cascader=total_cascader((1, 4, 6, 16)))
+    mod = _lower_to_tir(func, cascader=total_cascader((1, 4, 6, 16)))
     script = mod.script()
     mod = tvm.script.from_source(script)
     tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
@@ -819,7 +844,7 @@ def _get_func():
         return func
 
     func = _get_func()
-    mod, _ = _lower_to_tir(func, cascader=total_cascader((1, 4, 4, 16)))
+    mod = _lower_to_tir(func, cascader=total_cascader((1, 4, 4, 16)))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py
index 94763c5d3fbf..9f35a46915b5 100644
--- a/tests/python/contrib/test_ethosu/test_replace_copy.py
+++ b/tests/python/contrib/test_ethosu/test_replace_copy.py
@@ -27,7 +27,7 @@
 from tvm.relay.testing import run_opt_pass
 from tvm.script import tir as T
 
-from .infra import make_ethosu_conv2d
+from .infra import make_ethosu_conv2d, copy_allocate_const_data
 
 
 # fmt: off
@@ -37,7 +37,10 @@ class ReferenceModule:
     def main(input_placeholder_3: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write_1: T.Buffer((1, 16, 16, 8), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer_1 = T.Buffer([384], "uint8")
+
+        data_1 = T.allocate_const([0]*384,'uint8',[384])
+        buffer_1 = T.Buffer([384], "uint8",data=data_1)
+
         placeholder_3 = T.Buffer([8192], dtype="int8", data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([2048], dtype="int8", data=input_ethosu_write_1.data)
         # body
@@ -66,11 +69,12 @@ def _get_func():
         return func
 
     func = _get_func()
-    mod, _ = _lower_to_tir(func, cascader=copy_constants())
+    mod = _lower_to_tir(func, cascader=copy_constants())
 
     script = mod.script()
     test_mod = tvm.script.from_source(script)
     reference_mod = ReferenceModule
+    reference_mod = copy_allocate_const_data(test_mod, reference_mod)
     tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
 
@@ -81,8 +85,12 @@ class WeightStream:
     def main(input_placeholder_5: T.Buffer((1, 16, 16, 32), "int8"), input_ethosu_write_1: T.Buffer((1, 16, 16, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([528], "uint8")
-        buffer_2 = T.Buffer([336], "uint8")
+
+        data_2 = T.allocate_const([0]*336, 'uint8',[336])
+        buffer_2 = T.Buffer([336], "uint8",data=data_2)
+        data = T.allocate_const([0]*528, 'uint8',[528])
+        buffer = T.Buffer([528], "uint8",data=data)
+
         placeholder_5 = T.Buffer([8192], dtype="int8", data=input_placeholder_5.data)
         ethosu_write_1 = T.Buffer([4096], dtype="int8", data=input_ethosu_write_1.data)
         # body
@@ -126,11 +134,12 @@ def _get_func():
         return func
 
     func = _get_func()
-    mod, _ = _lower_to_tir(func, cascader=_cascader)
+    mod = _lower_to_tir(func, cascader=_cascader)
 
     script = mod.script()
     test_mod = tvm.script.from_source(script)
     reference_mod = WeightStream
+    reference_mod = copy_allocate_const_data(test_mod, reference_mod)
     tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
 
diff --git a/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
index 32f75621fde0..28af89dee01a 100644
--- a/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py
@@ -111,7 +111,7 @@ def _get_func(
         return func
 
     func = _get_func(*trial)
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
diff --git a/tests/python/contrib/test_ethosu/test_replace_identity.py b/tests/python/contrib/test_ethosu/test_replace_identity.py
index 775ef1260665..30b987c735cb 100644
--- a/tests/python/contrib/test_ethosu/test_replace_identity.py
+++ b/tests/python/contrib/test_ethosu/test_replace_identity.py
@@ -33,7 +33,7 @@ def test_identity(ifm_shape):
 
     func = relay.Function(relay.analysis.free_vars(identity), identity)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
diff --git a/tests/python/contrib/test_ethosu/test_replace_pooling.py b/tests/python/contrib/test_ethosu/test_replace_pooling.py
index e4438eb62abd..8612bbb348ed 100644
--- a/tests/python/contrib/test_ethosu/test_replace_pooling.py
+++ b/tests/python/contrib/test_ethosu/test_replace_pooling.py
@@ -188,7 +188,7 @@ def test_avg_max_pooling_single(
     )
     func = relay.Function(relay.analysis.free_vars(pooling), pooling)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
@@ -244,7 +244,7 @@ def test_sum_pooling_single(
     )
     func = relay.Function(relay.analysis.free_vars(pooling), pooling)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
@@ -307,7 +307,7 @@ def test_correct_stride_with_multiple_pooling():
     )
     func = relay.Function(relay.analysis.free_vars(op), op)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
 
     data = []
 
diff --git a/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py b/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py
index f61ace0d51ec..02271a0eea90 100644
--- a/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py
+++ b/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py
@@ -69,7 +69,7 @@ def test_unary_elementwise_single(
     )
     func = relay.Function(relay.analysis.free_vars(unary_elementwise), unary_elementwise)
     func = run_opt_pass(func, relay.transform.InferType())
-    mod, _ = _lower_to_tir(func)
+    mod = _lower_to_tir(func)
     data = []
 
     def _visit(stmt):
diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py
index 1edd840b0b0e..62269a69c007 100644
--- a/tests/python/contrib/test_ethosu/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/test_scheduler.py
@@ -41,6 +41,7 @@
     make_ethosu_conv2d,
     make_ethosu_identity,
     make_ethosu_binary_elementwise,
+    copy_allocate_const_data,
 )
 
 
@@ -198,10 +199,15 @@ class DiamondGraphTir:
     @T.prim_func
     def main(input_placeholder: T.Buffer((1, 56, 56, 96), "int8"), input_ethosu_write: T.Buffer((1, 56, 56, 24), "int8")) -> None:
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+
+        data3 = T.allocate_const([0]*976, 'uint8', [976])
+        buffer3 = T.Buffer([976], "uint8", data=data3)
+        data1 = T.allocate_const([0]*2848, 'uint8', [2848])
+        buffer1 = T.Buffer([2848], "uint8", data=data1)
+
+
         placeholder = T.Buffer([301056], dtype='int8', data=input_placeholder.data)
         ethosu_write = T.Buffer([75264], dtype='int8', data=input_ethosu_write.data)
-        buffer1 = T.Buffer([2848], "uint8")
-        buffer3 = T.Buffer([976], "uint8")
         p1_data = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True})
         p1 = T.Buffer([2848], "uint8", data=p1_data)
         p2_data = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True})
@@ -228,8 +234,9 @@ def test_schedule_diamond_graph():
     func = relay.Function(relay.analysis.free_vars(add), add)
     func = run_opt_pass(func, relay.transform.InferType())
 
-    test_mod, _ = _lower_to_tir(func, copy_constants())
+    test_mod = _lower_to_tir(func, copy_constants())
     reference_mod = DiamondGraphTir
+    reference_mod = copy_allocate_const_data(test_mod, reference_mod)
     tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
 
diff --git a/tests/python/unittest/test_tir_transform_make_packed_api.py b/tests/python/unittest/test_tir_transform_make_packed_api.py
index 2f871a246f53..c9c7df4dbc34 100644
--- a/tests/python/unittest/test_tir_transform_make_packed_api.py
+++ b/tests/python/unittest/test_tir_transform_make_packed_api.py
@@ -147,7 +147,10 @@ def test_device_api_context_implicit_resource_handle():
         lambda f: f.with_attr("target", tvm.target.Target("llvm", host="llvm"))
     )(mod)
     mod = tvm.tir.transform.Apply(lambda f: f.with_attr("global_symbol", "main"))(mod)
-    func = tvm.tir.transform.MakePackedAPI()(mod)["main"]
+    mod.show(name="before")
+    mod = tvm.tir.transform.MakePackedAPI()(mod)
+    mod.show(name="after")
+    func = mod["main"]
 
     num_args = func.params[2]
     device_context_in_resource_handle = func.params[5]

From 6b11704bbdebaaa47dfa3c3e27b8c0f571f273f9 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 27 Sep 2023 15:11:56 -0500
Subject: [PATCH 2/6] Use allocateconst for InlineReshape

---
 .../test_ethosu/test_replace_conv2d.py        | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 3293f98b317e..9aa22cf31d74 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -735,8 +735,11 @@ class Conv2dInlineReshape1:
     def main(input_placeholder_3: T.Buffer((4, 6, 8, 1), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([848], "uint8")
+
+        data = T.allocate_const([0]*192, 'uint8', 160)
+        buffer = T.Buffer([160], "uint8", data=data)
+        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -751,8 +754,10 @@ class Conv2dInlineReshape2:
     def main(input_placeholder_3: T.Buffer((1, 24, 8), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([848], "uint8")
+        data = T.allocate_const([0]*192, 'uint8', 160)
+        buffer = T.Buffer([160], "uint8", data=data)
+        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -767,8 +772,10 @@ class Conv2dInlineReshape3:
     def main(input_placeholder_3: T.Buffer((192, 1), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([848], "uint8")
+        data = T.allocate_const([0]*192, 'uint8', 160)
+        buffer = T.Buffer([160], "uint8", data=data)
+        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -783,8 +790,10 @@ class Conv2dInlineReshape4:
     def main(placeholder_3: T.Buffer((192,), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([848], "uint8")
+        data = T.allocate_const([0]*192, 'uint8', 160)
+        buffer = T.Buffer([160], "uint8", data=data)
+        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        buffer_1 = T.Buffer([848], "uint8", data=data_1)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
         # body
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
@@ -825,6 +834,7 @@ def _get_func(ifm_shape, reshaped, ifm_layout):
     params = trial[1:]
     func = _get_func(*params)
     mod = _lower_to_tir(func, cascader=total_cascader((1, 4, 6, 16)))
+    reference_mod = copy_allocate_const_data(mod, reference_mod)
     script = mod.script()
     mod = tvm.script.from_source(script)
     tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)

From 3defbe38f3b94d7a4cab26049c3a92108d43deb8 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 27 Sep 2023 15:15:56 -0500
Subject: [PATCH 3/6] Walk past allocate_const in cascader.test_integration

---
 .../python/contrib/test_ethosu/cascader/test_integration.py  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_ethosu/cascader/test_integration.py b/tests/python/contrib/test_ethosu/cascader/test_integration.py
index 29b994b0acbd..239382b5f200 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_integration.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_integration.py
@@ -124,7 +124,10 @@ def test_double_conv_compute_cycles_hint():
     for double convolution.
     """
     primfunc = _compile_model(_create_double_conv2d())
-    ops = primfunc.body.body.body.body.seq
+
+    ops = primfunc.body
+    while not isinstance(ops, tvm.tir.SeqStmt):
+        ops = ops.body
     compute_cycles_hints = [2944, 1408, 320, 240]
     for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
         assert op.attr_key == "pragma_compute_cycles_hint"

From edddfdc3bbf2ff665dbcbb2c86c70ddaaf1f612f Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 27 Sep 2023 15:20:27 -0500
Subject: [PATCH 4/6] Use allocate_const in tests of inline copy

---
 .../contrib/test_ethosu/test_replace_conv2d.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 9aa22cf31d74..3b473aac641e 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -676,8 +676,13 @@ class Conv2dInlineCopy1:
     def main(input_placeholder_3: T.Buffer((1, 10, 12, 8), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 8, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([848], "uint8")
-        buffer_1 = T.Buffer([160], "uint8")
+
+        data_1 = T.allocate_const([0]*160, 'uint8', [160])
+        buffer_1 = T.Buffer([160], "uint8", data=data_1)
+        data = T.allocate_const([0]*848, 'uint8', [848])
+        buffer = T.Buffer([848], "uint8", data=data)
+
+
         placeholder_3 = T.Buffer([960], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([1024], 'int8', data=input_ethosu_write_1.data)
         # body
@@ -691,8 +696,13 @@ class Conv2dInlineCopy2:
     def main(input_placeholder_3: T.Buffer((1, 7, 9, 5), "int8"), input_ethosu_write_1: T.Buffer((1, 3, 5, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.Buffer([160], "uint8")
-        buffer_1 = T.Buffer([656], "uint8")
+
+        data_1 = T.allocate_const([0]*656, 'uint8', [656])
+        buffer_1 = T.Buffer([656], "uint8", data=data_1)
+        data = T.allocate_const([0]*160, 'uint8', [160])
+        buffer = T.Buffer([160], "uint8", data=data)
+
+
         placeholder_3 = T.Buffer([315], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([240], 'int8', data=input_ethosu_write_1.data)
         # body

From 0bf074233ffa885610faa1742c15e956b2b2cf77 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Wed, 27 Sep 2023 15:35:56 -0500
Subject: [PATCH 5/6] Disable check on get_source

---
 tests/python/contrib/test_ethosu/test_codegen.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index e094bb74b2e1..5bf9dfbccdd0 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -1081,10 +1081,11 @@ def depthwise_conv2d(x):
         '__attribute__((section(".rodata.tvm"), aligned(16))) static int8_t tvmgen_default_ethos_u_main_0_cms_data_data'
         in source
     )
-    assert (
-        '__attribute__((section(".rodata.tvm"), aligned(16))) static int8_t tvmgen_default_ethos_u_main_0_weights'
-        in source
-    )
+    # The weights are now encoded by TVM in the AllocateConst node.
+    # assert (
+    #     '__attribute__((section(".rodata.tvm"), aligned(16))) static int8_t tvmgen_default_ethos_u_main_0_weights'
+    #     in source
+    # )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)

From 54a8c90c195f001427ee4241244e90143bca5b48 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Thu, 28 Sep 2023 09:00:51 -0500
Subject: [PATCH 6/6] Correct arguments for T.allocate_const

---
 .../contrib/test_ethosu/test_replace_conv2d.py   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 3b473aac641e..b2d9921e67fe 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -746,9 +746,9 @@ def main(input_placeholder_3: T.Buffer((4, 6, 8, 1), "int8"), input_ethosu_write
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
 
-        data = T.allocate_const([0]*192, 'uint8', 160)
+        data = T.allocate_const([0]*192, 'uint8', [160])
         buffer = T.Buffer([160], "uint8", data=data)
-        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        data_1 = T.allocate_const([0]*848, 'uint8', [848])
         buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
@@ -764,9 +764,9 @@ class Conv2dInlineReshape2:
     def main(input_placeholder_3: T.Buffer((1, 24, 8), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        data = T.allocate_const([0]*192, 'uint8', 160)
+        data = T.allocate_const([0]*192, 'uint8', [160])
         buffer = T.Buffer([160], "uint8", data=data)
-        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        data_1 = T.allocate_const([0]*848, 'uint8', [848])
         buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
@@ -782,9 +782,9 @@ class Conv2dInlineReshape3:
     def main(input_placeholder_3: T.Buffer((192, 1), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        data = T.allocate_const([0]*192, 'uint8', 160)
+        data = T.allocate_const([0]*192, 'uint8', [160])
         buffer = T.Buffer([160], "uint8", data=data)
-        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        data_1 = T.allocate_const([0]*848, 'uint8', [848])
         buffer_1 = T.Buffer([848], "uint8", data=data_1)
         placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
@@ -800,9 +800,9 @@ class Conv2dInlineReshape4:
     def main(placeholder_3: T.Buffer((192,), "int8"), input_ethosu_write_1: T.Buffer((1, 8, 6, 16), "int8")) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        data = T.allocate_const([0]*192, 'uint8', 160)
+        data = T.allocate_const([0]*192, 'uint8', [160])
         buffer = T.Buffer([160], "uint8", data=data)
-        data_1 = T.allocate_const([0]*848, 'uint8', 848)
+        data_1 = T.allocate_const([0]*848, 'uint8', [848])
         buffer_1 = T.Buffer([848], "uint8", data=data_1)
         ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data)
         # body