From e823988bfea1ebe30cee09915a172123a9a70d64 Mon Sep 17 00:00:00 2001
From: Noah Verke
Date: Mon, 17 Oct 2022 16:20:18 -0700
Subject: [PATCH 1/8] [Hexagon] Add a test to show how to use multi-input
 async DMA pipelining.

---
 .../test_hexagon/test_async_dma_pipeline.py | 58 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 2 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index d05e0a6e9216..f2b27208de5a 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -72,9 +72,23 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None:
     return tvm.tir.Schedule(operator)
 
 
-def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy=0):
+def evaluate(
+    hexagon_session,
+    sch,
+    a,
+    b,
+    size_a,
+    expected_output,
+    use_async_copy=0,
+    merge_async_commit_queue_scope=False,
+):
     target_hexagon = tvm.target.hexagon("v68", link_params=True)
-    with tvm.transform.PassContext(config={"tir.use_async_copy": use_async_copy}):
+    with tvm.transform.PassContext(
+        config={
+            "tir.use_async_copy": use_async_copy,
+            "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope,
+        }
+    ):
         func_tir = tvm.build(
             sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
         )
@@ -259,6 +273,29 @@ def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
     return sch
 
 
+def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
+    sch = conv_approximation(size_a, size_w)
+
+    compute_block = sch.get_block("C")
+
+    n = sch.get_loops(compute_block)[0]
+    no, _ = sch.split(n, [blocks, None])
+
+    cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm")
+    sch.compute_at(cache_read_block_a, no)
+    sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
+
+    cache_read_block_b = sch.cache_read(compute_block, 1, "global.vtcm")
+    sch.compute_at(cache_read_block_b, no)
+    sch.fuse(*sch.get_loops(cache_read_block_b)[1:])
+
+    cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_read_block_c, no)
+    sch.fuse(*sch.get_loops(cache_read_block_c)[1:])
+
+    return sch
+
+
 def print_results(test_key, runtimes):
     print(test_key)
     for runtime in runtimes.items():
@@ -321,6 +358,22 @@ def test_loading_vtcm_for_vrmpy(
         hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1
     )
 
+    sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w)
+    n = sch.get_loops(sch.get_block("C"))[0]
+    sch.annotate(n, "software_pipeline_stage", [0, 0, 1, 2])
+    sch.annotate(n, "software_pipeline_order", [0, 1, 2, 3])
+    sch.annotate(n, "software_pipeline_async_stages", [0, 2])
+    async_multi_input_output_runtime = evaluate(
+        hexagon_session,
+        sch,
+        input_a,
+        input_w,
+        size_a,
+        expected_output,
+        use_async_copy=1,
+        merge_async_commit_queue_scope=False,
+    )
+
     sch = get_fake_conv_vtcm_schedule(size_a, size_w)
     n = sch.get_loops(sch.get_block("C"))[0]
     sch.annotate(n, "software_pipeline_stage", [0, 1, 2])
@@ -349,5 +402,6 @@ def test_loading_vtcm_for_vrmpy(
             "async_dma_input": async_input_runtime,
             "async_dma_output": async_output_runtime,
             "async_dma_input_output": async_input_output_runtime,
+            "async_dma_multi_input_output": async_multi_input_output_runtime,
         },
     )

From c040a7fdb8586e949fd5a87cc575ff7be480f846 Mon Sep 17 00:00:00 2001
From: Noah Verke
Date: Wed, 19 Oct 2022 13:06:42 -0700
Subject: [PATCH 2/8] Update variable naming and remove the 4.19MB test that
 fails on 888 devices.

---
 .../test_hexagon/test_async_dma_pipeline.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index f2b27208de5a..97e70b75c799 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -266,9 +266,9 @@ def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
     sch.compute_at(cache_read_block_a, no)
     sch.fuse(*sch.get_loops(cache_read_block_a)[1:])
 
-    cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
-    sch.reverse_compute_at(cache_read_block_c, no)
-    sch.fuse(*sch.get_loops(cache_read_block_c)[1:])
+    cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
 
     return sch
 
@@ -289,9 +289,9 @@ def get_multi_input_fake_conv_vtcm_schedule(size_a, size_w, blocks=2):
     sch.compute_at(cache_read_block_b, no)
     sch.fuse(*sch.get_loops(cache_read_block_b)[1:])
 
-    cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
-    sch.reverse_compute_at(cache_read_block_c, no)
-    sch.fuse(*sch.get_loops(cache_read_block_c)[1:])
+    cache_write_block_c = sch.cache_write(compute_block, 0, "global.vtcm")
+    sch.reverse_compute_at(cache_write_block_c, no)
+    sch.fuse(*sch.get_loops(cache_write_block_c)[1:])
 
     return sch
 
@@ -308,7 +308,7 @@ class TestAsyncDMAPipeline:
     size_a = tvm.testing.parameter(
         1024,
         64 * 64,
-        128 * 128,
+        64 * 128,
     )
 
     size_w = tvm.testing.parameter(

From 9753ef9b3e26edbd30b29f20d5ccb691891339f8 Mon Sep 17 00:00:00 2001
From: Noah Verke
Date: Tue, 25 Oct 2022 10:38:50 -0700
Subject: [PATCH 3/8] [Hexagon] Add test cases for adding async DMA pipelining
 to metaschedule-generated conv2d.
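
The new ModulePipelined module stages both conv2d inputs and the output
through "global.vtcm" buffers: the weights (p1) are copied up front, while
the activation tile (p0) and the output tile move inside the outer loop, so
those copies can be lowered to asynchronous DMA transfers. ModuleBase is the
same metascheduled conv2d operating directly on main memory, kept as a
baseline. With the copies expressed as separate blocks under the outer loop,
enabling the pipeline is just a matter of annotating that loop; a minimal
sketch of the pattern the new test uses (all names as in this patch):

    sch = tvm.tir.Schedule(ModulePipelined)
    # The outer loop carries one pipeline iteration: copy-in, compute, copy-out.
    o = sch.get_loops(sch.get_block("conv2d_NCHWc_int8_o_update"))[0]
    sch.annotate(o, "software_pipeline_stage", [0, 1, 2])
    sch.annotate(o, "software_pipeline_order", [0, 1, 2])
    # Stages 0 and 2 (the DMA copies) run asynchronously around the compute.
    sch.annotate(o, "software_pipeline_async_stages", [0, 2])

The module is then built with "tir.use_async_copy" enabled and timed with
time_evaluator, as in the existing tests.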
--- .../test_hexagon/test_async_dma_pipeline.py | 212 +++++++++++++++++- 1 file changed, 210 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 97e70b75c799..c4b1489fa47c 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -24,8 +24,6 @@ from tvm.script import tir as T from numpy.random import default_rng -from tvm.tir.function import TensorIntrin - VRMPY_SIZE_B = 128 VRMPY_SIZE_INT32 = 32 @@ -405,3 +403,213 @@ def test_loading_vtcm_for_vrmpy( "async_dma_multi_input_output": async_multi_input_output_runtime, }, ) + +# from tvm.script import tir as T +@tvm.script.ir_module +class ModulePipelined: + @T.prim_func + def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None: + # function attr dict + T.func_attr({"tir.noalias": True, "global_symbol": "main"}) + # body + # with T.block("root") + conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32", scope="global.vtcm") + p0_global_vtcm = T.alloc_buffer([1, 1, 230, 230, 4], dtype="uint8", scope="global.vtcm") + p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm") + for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4): + with T.block("p1_global.vtcm"): + v0, v1, v2, v3, v4, v5, v6 = T.axis.remap("SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]) + T.reads(p1[v0, v1, v2, v3, v4, v5, v6]) + T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6]) + p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6] + for po in T.serial(4): + for i in T.serial(55876): + with T.block("p0_global.vtcm"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(1, 0) + v2 = T.axis.spatial(230, po * 56 + i // 916) + v3 = T.axis.spatial(230, i % 916 // 4) + v4 = T.axis.spatial(4, i % 4) + T.reads(p0[v0, v1, v2, v3, v4]) + T.writes(p0_global_vtcm[v0, v1, v2, v3, v4]) + p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4] + for i in T.parallel(28): + for ii, iii, iiii in T.grid(2, 14, 8): + with T.block("conv2d_NCHWc_int8_o_init"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, ii) + oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + iii) + ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii) + oc_block_o = T.axis.spatial(1, 0) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + for i4_1 in T.vectorized(32): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(32, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i1_1, i5_1, i6_1, i2_2, i3_2 in T.grid(2, 7, 7, 14, 8): + with T.block("conv2d_NCHWc_int8_o_update"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, i1_1) + oh = T.axis.spatial(112, (po * 28 + i) // 14 * 14 + i2_2) + ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + i3_2) + oc_block_o = T.axis.spatial(1, 0) + kh = T.axis.reduce(7, i5_1) + kw = T.axis.reduce(7, i6_1) + ic_outer = T.axis.reduce(1, 0) + ic_f_inner = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) + T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4]) + 
T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + A = T.match_buffer(p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1, scope="global.vtcm") + B = T.match_buffer(p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1, scope="global.vtcm") + C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1, scope="global.vtcm") + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x128 = B[0, 0:128] + B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32") + for i in T.serial(200704): + with T.block("conv2d_NCHWc_int8.vtcm"): + ax0_1 = T.axis.spatial(1, 0) + ax1_1 = T.axis.spatial(2, i % 7168 // 3584) + ax2_1 = T.axis.spatial(112, (po * 28 + i // 7168) // 14 * 14 + i % 3584 // 256) + ax3_1 = T.axis.spatial(112, (po * 28 + i // 7168) % 14 * 8 + i % 256 // 32) + ax4 = T.axis.spatial(32, i % 32) + T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + +# from tvm.script import tir as T +@tvm.script.ir_module +class ModuleBase: + @T.prim_func + def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None: + # function attr dict + T.func_attr({"tir.noalias": True, "global_symbol": "main"}) + # buffer definition + # body + # with T.block("root") + conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") + for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(112, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): + for i4_0_0 in T.serial(1): + for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(2, 1, 1, 1, 14, 8): + with T.block("conv2d_NCHWc_int8_o_init"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init) + oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init) + ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init) + oc_block_o = T.axis.spatial(1, 0) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + for i4_1 in T.vectorized(32): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(32, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1): + for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_2, i1_2, i2_2, i3_2, i4_0_2 in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1): + with T.block("conv2d_NCHWc_int8_o_update"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(2, i1_1 + i1_2) + oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2) + ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2) + oc_block_o = T.axis.spatial(1, 0) + kh = T.axis.reduce(7, i5_0 * 7 + i5_1) + kw = T.axis.reduce(7, i6_0 * 7 + i6_1) + ic_outer = T.axis.reduce(1, 0) + ic_f_inner = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, 0) + T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0[n, 
ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + A = T.match_buffer(p0[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) + B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1) + C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1) + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x128 = B[0, 0:128] + B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32") + for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8): + for ax4_fused in T.vectorized(32): + with T.block("T_cast_2"): + ax0_1, ax1_1 = T.axis.remap("SS", [ax0, ax1]) + ax2_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2) + ax3_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3) + ax4 = T.axis.spatial(32, ax4_fused) + T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + + +@tvm.testing.requires_hexagon +def test_meta(hexagon_session): + sch = tvm.tir.Schedule(ModulePipelined) + a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") + w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") + c = np.zeros((1, 2, 112, 112, 32), dtype="int32") + + outer_block = sch.get_block("conv2d_NCHWc_int8_o_update") + o = sch.get_loops(outer_block)[0] + + sch.annotate(o, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(o, "software_pipeline_order", [0, 1, 2]) + sch.annotate(o, "software_pipeline_async_stages", [0, 2]) + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + with tvm.transform.PassContext( + config={ + "tir.use_async_copy": True + } + ): + # print(tvm.lower(sch.mod).script()) + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + if tvm.testing.utils.IS_IN_CI: + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) + else: + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon) + print(time) + + +@tvm.testing.requires_hexagon +def test_meta_base(hexagon_session): + sch = tvm.tir.Schedule(ModuleBase) + a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") + w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") + c = np.zeros((1, 2, 112, 112, 32), dtype="int32") + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + with tvm.transform.PassContext( + config={ + "tir.use_async_copy": True + } + ): + # print(tvm.lower(sch.mod).script()) + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + a_hexagon = 
tvm.runtime.ndarray.array(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + if tvm.testing.utils.IS_IN_CI: + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) + else: + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon) + print(time) From bdbedab00ce00877dc663926ae8e47279318c4bd Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Tue, 25 Oct 2022 12:24:01 -0700 Subject: [PATCH 4/8] Add tests for actual conv2d sourced from resnet50 metascheduling. --- .../test_hexagon/test_async_dma_pipeline.py | 109 ++++++------------ 1 file changed, 35 insertions(+), 74 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index c4b1489fa47c..eb6b82b2f121 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -75,8 +75,8 @@ def evaluate( sch, a, b, - size_a, - expected_output, + c, + expected_output=None, use_async_copy=0, merge_async_commit_queue_scope=False, ): @@ -94,9 +94,7 @@ def evaluate( a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array( - np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32"), device=hexagon_session.device - ) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) if tvm.testing.utils.IS_IN_CI: # Run with reduced number and repeat for CI @@ -105,10 +103,10 @@ def evaluate( timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10) time = timer(a_hexagon, b_hexagon, c_hexagon) - tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output) + if expected_output is not None: + tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output) return round(time.mean * 1000, 4) - @tvm.testing.fixture def input_a(size_a): return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") @@ -306,13 +304,10 @@ class TestAsyncDMAPipeline: size_a = tvm.testing.parameter( 1024, 64 * 64, - 64 * 128, ) size_w = tvm.testing.parameter( - 1 * 1, 3 * 3, - 7 * 7, 9 * 9, ) @@ -331,11 +326,11 @@ def test_loading_vtcm_for_vrmpy( pytest.skip("Skipping test since it takes too long in CI.") sch = conv_approximation(size_a, size_w) - base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) + base_runtime = evaluate(hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output) sch = get_fake_conv_vtcm_schedule(size_a, size_w) base_vtcm_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) @@ -344,7 +339,7 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0]) async_input_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), 
expected_output, use_async_copy=1 ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) @@ -353,7 +348,7 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) async_input_output_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 ) sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w) @@ -366,7 +361,7 @@ def test_loading_vtcm_for_vrmpy( sch, input_a, input_w, - size_a, + np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1, merge_async_commit_queue_scope=False, @@ -378,12 +373,12 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [2]) async_output_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 ) sch = get_single_dma_schedule(size_a, size_w) single_dma_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output + hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output ) # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128 @@ -547,69 +542,35 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 @tvm.testing.requires_hexagon def test_meta(hexagon_session): - sch = tvm.tir.Schedule(ModulePipelined) a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") c = np.zeros((1, 2, 112, 112, 32), dtype="int32") - outer_block = sch.get_block("conv2d_NCHWc_int8_o_update") - o = sch.get_loops(outer_block)[0] + sch = tvm.tir.Schedule(ModuleBase) + base_runtime = evaluate(hexagon_session, sch, a, w, c) + + sch = tvm.tir.Schedule(ModulePipelined) + compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") + o = sch.get_loops(compute_block)[0] + vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + + sch = tvm.tir.Schedule(ModulePipelined) + compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") + o = sch.get_loops(compute_block)[0] + sch.annotate(o, "software_pipeline_stage", [0, 1, 2]) sch.annotate(o, "software_pipeline_order", [0, 1, 2]) sch.annotate(o, "software_pipeline_async_stages", [0, 2]) - target_hexagon = tvm.target.hexagon("v68", link_params=True) - with tvm.transform.PassContext( - config={ - "tir.use_async_copy": True - } - ): - # print(tvm.lower(sch.mod).script()) - func_tir = tvm.build( - sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) - ) - module = hexagon_session.load_module(func_tir) - - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) - - if tvm.testing.utils.IS_IN_CI: - # Run with reduced number and repeat for CI - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) - else: - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) - time = timer(a_hexagon, b_hexagon, c_hexagon) - print(time) - - 
-@tvm.testing.requires_hexagon -def test_meta_base(hexagon_session): - sch = tvm.tir.Schedule(ModuleBase) - a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") - w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") - c = np.zeros((1, 2, 112, 112, 32), dtype="int32") - - target_hexagon = tvm.target.hexagon("v68", link_params=True) - with tvm.transform.PassContext( - config={ - "tir.use_async_copy": True - } - ): - # print(tvm.lower(sch.mod).script()) - func_tir = tvm.build( - sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) - ) - module = hexagon_session.load_module(func_tir) - a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) - b_hexagon = tvm.runtime.ndarray.array(w, device=hexagon_session.device) - c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) - - if tvm.testing.utils.IS_IN_CI: - # Run with reduced number and repeat for CI - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) - else: - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) - time = timer(a_hexagon, b_hexagon, c_hexagon) - print(time) + pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + + transfer_mb = round((2 * a.size * VRMPY_SIZE_B + w.size * VRMPY_SIZE_B) / 1e6, 2) + print_results( + f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...", + { + "without_vtcm": base_runtime, + "vtcm_runtime": vtcm_runtime, + "pipeline_runtime": pipeline_runtime, + }, + ) From 9920d3c3931b857365481318242d615890945849 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Tue, 25 Oct 2022 13:30:13 -0700 Subject: [PATCH 5/8] [Hexagon] Change logs. --- .../python/contrib/test_hexagon/test_async_dma_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index eb6b82b2f121..1f33e975ad69 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -553,7 +553,7 @@ def test_meta(hexagon_session): compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") o = sch.get_loops(compute_block)[0] - vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) + unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") @@ -565,12 +565,12 @@ def test_meta(hexagon_session): pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) - transfer_mb = round((2 * a.size * VRMPY_SIZE_B + w.size * VRMPY_SIZE_B) / 1e6, 2) + transfer_mb = round((a.size + w.size + c.size) / 1e6, 2) print_results( f"Test with A.size: {a.size}, W.size: {w.size}, and total memory transfer of {transfer_mb} MB...", { "without_vtcm": base_runtime, - "vtcm_runtime": vtcm_runtime, + "unscheduled_vtcm_runtime": unscheduled_vtcm_runtime, "pipeline_runtime": pipeline_runtime, }, ) From 66127f47d793f8c1f02011a1ad7cb1c16a7ca1ce Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Tue, 25 Oct 2022 15:58:14 -0700 Subject: [PATCH 6/8] Add test to show effect of increased buffer sizes for pipelining. 
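
In the software-pipeline annotations, the distance between stage indices
determines how many iterations may be in flight between a producer stage and
its consumer, and the lowering versions the intermediate VTCM buffers
accordingly. The variant added here keeps the same three stages and order but
spreads the stage indices to [0, 3, 6] instead of [0, 1, 2], so more copies
of each VTCM buffer are allocated and the DMA engine gets more latency to
hide, at the cost of extra VTCM. A sketch of the added configuration (taken
from the test in the diff below):

    sch = get_fake_conv_vtcm_schedule(size_a, size_w)
    n = sch.get_loops(sch.get_block("C"))[0]
    # Larger gaps between stages -> more iterations (and buffers) in flight.
    sch.annotate(n, "software_pipeline_stage", [0, 3, 6])
    sch.annotate(n, "software_pipeline_order", [0, 1, 2])
    sch.annotate(n, "software_pipeline_async_stages", [0, 6])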
--- .../test_hexagon/test_async_dma_pipeline.py | 256 +++++++++++++++--- 1 file changed, 219 insertions(+), 37 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 1f33e975ad69..2b2cb64b13b1 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -107,6 +107,7 @@ def evaluate( tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output) return round(time.mean * 1000, 4) + @tvm.testing.fixture def input_a(size_a): return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") @@ -304,9 +305,11 @@ class TestAsyncDMAPipeline: size_a = tvm.testing.parameter( 1024, 64 * 64, + 128 * 64, ) size_w = tvm.testing.parameter( + 1 * 1, 3 * 3, 9 * 9, ) @@ -326,11 +329,24 @@ def test_loading_vtcm_for_vrmpy( pytest.skip("Skipping test since it takes too long in CI.") sch = conv_approximation(size_a, size_w) - base_runtime = evaluate(hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output) + base_runtime = evaluate( + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, + ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) base_vtcm_runtime = evaluate( - hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, + use_async_copy=1, ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) @@ -339,7 +355,13 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0]) async_input_runtime = evaluate( - hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, + use_async_copy=1, ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) @@ -348,7 +370,28 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [0, 2]) async_input_output_runtime = evaluate( - hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, + use_async_copy=1, + ) + + sch = get_fake_conv_vtcm_schedule(size_a, size_w) + n = sch.get_loops(sch.get_block("C"))[0] + sch.annotate(n, "software_pipeline_stage", [0, 3, 6]) + sch.annotate(n, "software_pipeline_order", [0, 1, 2]) + sch.annotate(n, "software_pipeline_async_stages", [0, 6]) + async_input_output_runtime_larger_buffers = evaluate( + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, + use_async_copy=1, ) sch = get_multi_input_fake_conv_vtcm_schedule(size_a, size_w) @@ -373,12 +416,23 @@ def test_loading_vtcm_for_vrmpy( sch.annotate(n, "software_pipeline_order", [0, 1, 2]) sch.annotate(n, "software_pipeline_async_stages", [2]) async_output_runtime = evaluate( - hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output, use_async_copy=1 + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + 
expected_output, + use_async_copy=1, ) sch = get_single_dma_schedule(size_a, size_w) single_dma_runtime = evaluate( - hexagon_session, sch, input_a, input_w, np.zeros(expected_output.shape, "int32"), expected_output + hexagon_session, + sch, + input_a, + input_w, + np.zeros(expected_output.shape, "int32"), + expected_output, ) # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128 @@ -396,14 +450,20 @@ def test_loading_vtcm_for_vrmpy( "async_dma_output": async_output_runtime, "async_dma_input_output": async_input_output_runtime, "async_dma_multi_input_output": async_multi_input_output_runtime, + "async_input_output_runtime_larger_buffers": async_input_output_runtime_larger_buffers, }, ) + # from tvm.script import tir as T @tvm.script.ir_module class ModulePipelined: @T.prim_func - def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None: + def main( + p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + ) -> None: # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # body @@ -413,7 +473,9 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 p1_global_vtcm = T.alloc_buffer([2, 1, 7, 7, 1, 32, 4], dtype="int8", scope="global.vtcm") for ax0, ax1, ax2, ax3, ax4, ax5, ax6 in T.grid(2, 1, 7, 7, 1, 32, 4): with T.block("p1_global.vtcm"): - v0, v1, v2, v3, v4, v5, v6 = T.axis.remap("SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6]) + v0, v1, v2, v3, v4, v5, v6 = T.axis.remap( + "SSSSSSS", [ax0, ax1, ax2, ax3, ax4, ax5, ax6] + ) T.reads(p1[v0, v1, v2, v3, v4, v5, v6]) T.writes(p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6]) p1_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = p1[v0, v1, v2, v3, v4, v5, v6] @@ -437,7 +499,7 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 ow = T.axis.spatial(112, (po * 28 + i) % 14 * 8 + iiii) oc_block_o = T.axis.spatial(1, 0) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) @@ -456,16 +518,57 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) ic_s_inner_o = T.axis.reduce(1, 0) - T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4]) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) - A = T.match_buffer(p0_global_vtcm[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1, scope="global.vtcm") - B = T.match_buffer(p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1, scope="global.vtcm") - C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1, scope="global.vtcm") + T.reads( + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + p0_global_vtcm[ + n, + ic_outer, + oh * 2 + kh, + ow * 2 + kw, + ic_f_inner * 4 : ic_f_inner * 4 + 4, + ], + p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + ) + T.writes(conv2d_NCHWc_int8[n, 
oc_chunk, oh, ow, 0:32]) + A = T.match_buffer( + p0_global_vtcm[ + n, + ic_outer, + oh * 2 + kh, + ow * 2 + kw, + ic_f_inner * 4 : ic_f_inner * 4 + 4, + ], + [4], + dtype="uint8", + offset_factor=1, + scope="global.vtcm", + ) + B = T.match_buffer( + p1_global_vtcm[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + [32, 4], + dtype="int8", + offset_factor=1, + scope="global.vtcm", + ) + C = T.match_buffer( + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + [32], + dtype="int32", + offset_factor=1, + scope="global.vtcm", + ) A_u8x4: T.uint8x4 = A[0:4] A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") B_i8x128 = B[0, 0:128] B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin( + 4217, + T.uint32(3), + C[0:32], + T.broadcast(A_i32, 32), + B_i32x32, + dtype="int32x32", + ) for i in T.serial(200704): with T.block("conv2d_NCHWc_int8.vtcm"): ax0_1 = T.axis.spatial(1, 0) @@ -475,30 +578,45 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 ax4 = T.axis.spatial(32, i % 32) T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + ax0_1, ax1_1, ax2_1, ax3_1, ax4 + ] + # from tvm.script import tir as T @tvm.script.ir_module class ModuleBase: @T.prim_func - def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"]) -> None: + def main( + p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], + p1: T.Buffer[(2, 1, 7, 7, 1, 32, 4), "int8"], + T_cast: T.Buffer[(1, 2, 112, 112, 32), "int32"], + ) -> None: # function attr dict T.func_attr({"tir.noalias": True, "global_symbol": "main"}) # buffer definition # body # with T.block("root") conv2d_NCHWc_int8 = T.alloc_buffer([1, 2, 112, 112, 32], dtype="int32") - for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(112, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): + for i0_0_i1_0_i2_0_i3_0_fused in T.parallel( + 112, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} + ): for i4_0_0 in T.serial(1): - for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid(2, 1, 1, 1, 14, 8): + for i1_1_init, i2_1_init, i3_1_init, i1_2_init, i2_2_init, i3_2_init in T.grid( + 2, 1, 1, 1, 14, 8 + ): with T.block("conv2d_NCHWc_int8_o_init"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1_init + i1_2_init) - oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init) - ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init) + oh = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1_init * 14 + i2_2_init + ) + ow = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1_init * 8 + i3_2_init + ) oc_block_o = T.axis.spatial(1, 0) T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) for i4_1 in T.vectorized(32): with T.block("conv2d_NCHWc_int8_init"): oc_block_i_init = T.axis.spatial(32, i4_1) @@ -506,39 +624,103 @@ def main(p0: T.Buffer[(1, 1, 230, 230, 4), "uint8"], p1: T.Buffer[(2, 1, 7, 7, 1 T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) 
conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 for i5_0, i6_0, i7_0, i8_0, i9_0_0 in T.grid(1, 1, 1, 1, 1): - for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_2, i1_2, i2_2, i3_2, i4_0_2 in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1): + for ( + i0_1, + i1_1, + i2_1, + i3_1, + i4_0_1, + i5_1, + i6_1, + i7_1, + i8_1, + i9_0_1, + i0_2, + i1_2, + i2_2, + i3_2, + i4_0_2, + ) in T.grid(1, 2, 1, 1, 1, 7, 7, 1, 1, 1, 1, 1, 14, 8, 1): with T.block("conv2d_NCHWc_int8_o_update"): n = T.axis.spatial(1, 0) oc_chunk = T.axis.spatial(2, i1_1 + i1_2) - oh = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2) - ow = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2) + oh = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + i2_1 * 14 + i2_2 + ) + ow = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + i3_1 * 8 + i3_2 + ) oc_block_o = T.axis.spatial(1, 0) kh = T.axis.reduce(7, i5_0 * 7 + i5_1) kw = T.axis.reduce(7, i6_0 * 7 + i6_1) ic_outer = T.axis.reduce(1, 0) ic_f_inner = T.axis.reduce(1, 0) ic_s_inner_o = T.axis.reduce(1, 0) - T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], p0[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4]) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32]) - A = T.match_buffer(p0[n, ic_outer, oh * 2 + kh, ow * 2 + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) - B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 32, 0 : 4], [32, 4], dtype="int8", offset_factor=1) - C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 32], [32], dtype="int32", offset_factor=1) + T.reads( + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + p0[ + n, + ic_outer, + oh * 2 + kh, + ow * 2 + kw, + ic_f_inner * 4 : ic_f_inner * 4 + 4, + ], + p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32]) + A = T.match_buffer( + p0[ + n, + ic_outer, + oh * 2 + kh, + ow * 2 + kw, + ic_f_inner * 4 : ic_f_inner * 4 + 4, + ], + [4], + dtype="uint8", + offset_factor=1, + ) + B = T.match_buffer( + p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4], + [32, 4], + dtype="int8", + offset_factor=1, + ) + C = T.match_buffer( + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0:32], + [32], + dtype="int32", + offset_factor=1, + ) A_u8x4: T.uint8x4 = A[0:4] A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") B_i8x128 = B[0, 0:128] B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin(4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin( + 4217, + T.uint32(3), + C[0:32], + T.broadcast(A_i32, 32), + B_i32x32, + dtype="int32x32", + ) for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8): for ax4_fused in T.vectorized(32): with T.block("T_cast_2"): ax0_1, ax1_1 = T.axis.remap("SS", [ax0, ax1]) - ax2_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2) - ax3_1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3) + ax2_1 = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + ax2 + ) + ax3_1 = T.axis.spatial( + 112, i0_0_i1_0_i2_0_i3_0_fused % 14 * 8 + ax3 + ) ax4 = T.axis.spatial(32, ax4_fused) T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, 
ax3_1, ax4] - + T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = conv2d_NCHWc_int8[ + ax0_1, ax1_1, ax2_1, ax3_1, ax4 + ] + @tvm.testing.requires_hexagon def test_meta(hexagon_session): @@ -552,7 +734,7 @@ def test_meta(hexagon_session): sch = tvm.tir.Schedule(ModulePipelined) compute_block = sch.get_block("conv2d_NCHWc_int8_o_update") o = sch.get_loops(compute_block)[0] - + unscheduled_vtcm_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) sch = tvm.tir.Schedule(ModulePipelined) @@ -562,7 +744,7 @@ def test_meta(hexagon_session): sch.annotate(o, "software_pipeline_stage", [0, 1, 2]) sch.annotate(o, "software_pipeline_order", [0, 1, 2]) sch.annotate(o, "software_pipeline_async_stages", [0, 2]) - + pipeline_runtime = evaluate(hexagon_session, sch, a, w, c, use_async_copy=1) transfer_mb = round((a.size + w.size + c.size) / 1e6, 2) From b47ceadcb56c70a45ac2d1e1b538162c83c4097e Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Wed, 26 Oct 2022 11:12:50 -0700 Subject: [PATCH 7/8] skip tests in CI. --- tests/python/contrib/test_hexagon/test_async_dma_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 2b2cb64b13b1..7d4800f1f1ce 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -724,6 +724,9 @@ def main( @tvm.testing.requires_hexagon def test_meta(hexagon_session): + if tvm.testing.utils.IS_IN_CI: + pytest.skip("Skipping test since it takes too long in CI.") + a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") c = np.zeros((1, 2, 112, 112, 32), dtype="int32") From a0cfe023679e00a6ae6520b51f406e9d06c5207b Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Wed, 26 Oct 2022 11:18:58 -0700 Subject: [PATCH 8/8] lint --- tests/python/contrib/test_hexagon/test_async_dma_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 7d4800f1f1ce..45e8eb0f68c6 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -726,7 +726,7 @@ def main( def test_meta(hexagon_session): if tvm.testing.utils.IS_IN_CI: pytest.skip("Skipping test since it takes too long in CI.") - + a = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") w = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") c = np.zeros((1, 2, 112, 112, 32), dtype="int32")
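
-- 
Series note: the tests above print comparative timings rather than enforcing
performance thresholds, so they are most useful when run by hand. Assuming a
configured Hexagon toolchain and a device or simulator reachable through the
hexagon_session fixture, a typical local invocation would be:

    pytest tests/python/contrib/test_hexagon/test_async_dma_pipeline.py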