From 35ae9bcb8432269caf8f22564c1d3a650c711cb7 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Thu, 6 Oct 2022 14:07:04 -0700 Subject: [PATCH 1/4] [Hexagon] Add tests to show how to properly utilize async dma pipelining on hexagon. --- .../test_hexagon/test_async_dma_pipeline.py | 353 ++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/test_async_dma_pipeline.py diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py new file mode 100644 index 000000000000..b39a0bf5e74d --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -0,0 +1,353 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test different strategies for loading data into vtcm before running HVX workloads. """ + +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + + +def conv_approximation(size_a, size_w): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [size_a, 128], dtype="uint8", align=128) + W = T.match_buffer(b, [size_w, 128], dtype="uint8", align=128) + C = T.match_buffer(c, [size_a, 32], dtype="int32", align=128) + for n, i in T.grid(size_a, size_w): + with T.block("C"): + vn, vi = T.axis.remap("SR", [n, i]) + T.reads(A[vn, 0:128], W[vi, 0:128], C[vn, 0:32]) + T.writes(C[vn, 0:32]) + with T.init(): + for x in T.serial(32): + C[vn, x] = 0 + C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), + T.uint32(3), + C[vn, T.ramp(0, 1, 32)], + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.dma_wait", + 0, # QueueId + 0, # Wait for 0 in flight + dtype="int32", + ) + ) + + return tvm.tir.Schedule(operator) + + +def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy=0): + target_hexagon = tvm.target.hexagon("v68", link_params=True) + with tvm.transform.PassContext(config={"tir.use_async_copy": use_async_copy}): + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array( + np.zeros((size_a, 32), dtype="int32"), device=hexagon_session.device + ) + + if tvm.testing.utils.IS_IN_CI: + # These are reduced for CI + number = 1 + repeat = 1 + else: + number 
= 100 + repeat = 100 + + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) + time = timer(a_hexagon, b_hexagon, c_hexagon) + tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output) + return round(time.mean * 1000, 4) + + +@tvm.testing.fixture +def input_a(size_a): + return default_rng().integers(0, 8, (size_a, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_w(size_w): + return default_rng().integers(0, 8, (size_w, 128), dtype="uint8") + + +@tvm.testing.fixture +def expected_output(size_a, size_w, input_a, input_w): + expected_output = np.zeros((size_a, 32), dtype="int32") + for n in range(size_a): + for x in range(size_w): + for i in range(32): + for r in range(4): + expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32( + input_w[x, i * 4 + r] + ) + return expected_output + + +def get_single_dma_schedule(size_a, size_w): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [size_a, 128], dtype="uint8", align=128, mem_scope="global") + W = T.match_buffer(b, [size_w, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [size_a, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.alloc_buffer( + [size_a, 128], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + W_global_vtcm = T.alloc_buffer( + [size_w, 128], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.alloc_buffer( + [size_a, 32], dtype="int32", align=128, mem_scope="global.vtcm" + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + A_global_vtcm.data, + T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + 0, + 2, + A_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + A.data, + T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + 0, + 2, + A.dtype, + 0, + dtype="handle", + ), + T.cast(size_a, dtype="int") * 128, + dtype="int32", + ) + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + W_global_vtcm.data, + T.tvm_stack_make_shape(size_w, 128, dtype="handle"), + 0, + 2, + W_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + W.data, + T.tvm_stack_make_shape(size_w, 128, dtype="handle"), + 0, + 2, + W.dtype, + 0, + dtype="handle", + ), + T.cast(size_w, dtype="int") * 128, + dtype="int32", + ) + ) + for n, i in T.grid(size_a, size_w): + with T.block("C"): + vn, vi = T.axis.remap("SR", [n, i]) + T.reads(A_global_vtcm[vn, 0:128], W_global_vtcm[vi, 0:128], C_global_vtcm[vn, 0:32]) + T.writes(C_global_vtcm[vn, 0:32]) + with T.init(): + for x in T.serial(32): + C_global_vtcm[vn, x] = 0 + C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A_global_vtcm[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(W_global_vtcm[vi, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + C.data, + T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + 0, + 2, + C.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + C_global_vtcm.data, + T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + 0, + 2, + C_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.cast(size_a, dtype="int") * 128, + dtype="int32", + 
) + ) + + sch = tvm.tir.Schedule(operator) + + return sch + + +def get_fake_conv_vtcm_schedule(size_a, size_w, blocks=2): + sch = conv_approximation(size_a, size_w) + + compute_block = sch.get_block("C") + sch.cache_read(compute_block, 1, "global.vtcm") + + n = sch.get_loops(compute_block)[0] + no, _ = sch.split(n, [blocks, None]) + + cache_read_block_a = sch.cache_read(compute_block, 0, "global.vtcm") + sch.compute_at(cache_read_block_a, no) + sch.fuse(*sch.get_loops(cache_read_block_a)[1:]) + + cache_read_block_c = sch.cache_write(compute_block, 0, "global.vtcm") + sch.reverse_compute_at(cache_read_block_c, no) + sch.fuse(*sch.get_loops(cache_read_block_c)[1:]) + + return sch + + +def print_results(test_key, runtimes): + print(test_key) + for runtime in runtimes.items(): + print("-{} took {} ms".format(runtime[0], runtime[1])) + print() + + +def print_excel_results(test_key, runtimes): + for runtime in runtimes.items(): + test_key += f"{runtime[1]}, " + print(test_key) + + +class TestMatMulVec: + # Removed most of these to speedup CI. + size_a = tvm.testing.parameter( + 1024, + 64 * 64, + 128 * 128, + ) + + size_w = tvm.testing.parameter( + 1 * 1, + 3 * 3, + 7 * 7, + 9 * 9, + ) + + @tvm.testing.requires_hexagon + def test_loading_vtcm_for_vrmpy( + self, + hexagon_session, + size_a, + size_w, + input_a, + input_w, + expected_output, + ): + + if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): + print("skipping test due to ci") + return + + sch = conv_approximation(size_a, size_w) + base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) + + sch = get_fake_conv_vtcm_schedule(size_a, size_w) + base_vtcm_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + ) + + sch = get_fake_conv_vtcm_schedule(size_a, size_w) + n = sch.get_loops(sch.get_block("C"))[0] + sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(n, "software_pipeline_order", [0, 1, 2]) + sch.annotate(n, "software_pipeline_async_stages", [0]) + async_input_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + ) + + sch = get_fake_conv_vtcm_schedule(size_a, size_w) + n = sch.get_loops(sch.get_block("C"))[0] + sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(n, "software_pipeline_order", [0, 1, 2]) + sch.annotate(n, "software_pipeline_async_stages", [0, 2]) + async_input_output_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + ) + + sch = get_fake_conv_vtcm_schedule(size_a, size_w) + n = sch.get_loops(sch.get_block("C"))[0] + sch.annotate(n, "software_pipeline_stage", [0, 1, 2]) + sch.annotate(n, "software_pipeline_order", [0, 1, 2]) + sch.annotate(n, "software_pipeline_async_stages", [2]) + async_output_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output, use_async_copy=1 + ) + + sch = get_single_dma_schedule(size_a, size_w) + single_dma_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output + ) + + transfer_mb = round((2 * size_a * 128 + size_w * 128) / 1e6, 2) + complexity = round(size_a * size_w * (128 * 4) / 1e9, 3) + # print_results( + # f"Test with A.size: {size_a * 128}, W.size: {size_w * 128}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", + # { + # "without_vtcm": base_runtime, + # "synchronous_dma": single_dma_runtime, + # "base_vtcm": base_vtcm_runtime, + # 
"async_dma_input": async_input_runtime, + # "async_dma_output": async_output_runtime, + # "async_dma_input_output": async_input_output_runtime, + # }, + # ) + print_excel_results( + f"{size_a * 128}, {size_w * 128}, {complexity}, {transfer_mb}, ", + { + "without_vtcm": base_runtime, + "synchronous_dma": single_dma_runtime, + "base_vtcm": base_vtcm_runtime, + "async_dma_input": async_input_runtime, + "async_dma_output": async_output_runtime, + "async_dma_input_output": async_input_output_runtime, + }, + ) From 3f1fd1cda0ef620ee9ffa37b7c9bf9be0e521e7f Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 7 Oct 2022 08:56:09 -0700 Subject: [PATCH 2/4] Formatting updates. --- .../test_hexagon/test_async_dma_pipeline.py | 31 +++++-------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index b39a0bf5e74d..ef38eaefff63 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -81,6 +81,7 @@ def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy number = 100 repeat = 100 + timer = module.time_evaluator( "__tvm_main__", hexagon_session.device, number=number, repeat=repeat ) @@ -111,7 +112,6 @@ def expected_output(size_a, size_w, input_a, input_w): ) return expected_output - def get_single_dma_schedule(size_a, size_w): @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: @@ -251,12 +251,6 @@ def print_results(test_key, runtimes): print() -def print_excel_results(test_key, runtimes): - for runtime in runtimes.items(): - test_key += f"{runtime[1]}, " - print(test_key) - - class TestMatMulVec: # Removed most of these to speedup CI. 
size_a = tvm.testing.parameter( @@ -288,7 +282,9 @@ def test_loading_vtcm_for_vrmpy( return sch = conv_approximation(size_a, size_w) - base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) + base_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output + ) sch = get_fake_conv_vtcm_schedule(size_a, size_w) base_vtcm_runtime = evaluate( @@ -323,25 +319,12 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_single_dma_schedule(size_a, size_w) - single_dma_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output - ) + single_dma_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) transfer_mb = round((2 * size_a * 128 + size_w * 128) / 1e6, 2) complexity = round(size_a * size_w * (128 * 4) / 1e9, 3) - # print_results( - # f"Test with A.size: {size_a * 128}, W.size: {size_w * 128}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", - # { - # "without_vtcm": base_runtime, - # "synchronous_dma": single_dma_runtime, - # "base_vtcm": base_vtcm_runtime, - # "async_dma_input": async_input_runtime, - # "async_dma_output": async_output_runtime, - # "async_dma_input_output": async_input_output_runtime, - # }, - # ) - print_excel_results( - f"{size_a * 128}, {size_w * 128}, {complexity}, {transfer_mb}, ", + print_results( + f"Test with A.size: {size_a * 128}, W.size: {size_w * 128}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", { "without_vtcm": base_runtime, "synchronous_dma": single_dma_runtime, From 7a49450a4f5a831ffc2726dc9e961e2fe76dcec6 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Mon, 10 Oct 2022 15:57:16 -0700 Subject: [PATCH 3/4] Update comments and reformatting. 
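
Beyond the cosmetic changes, this patch hoists the vrmpy vector widths into the
VRMPY_SIZE_B / VRMPY_SIZE_INT32 constants, documents why a manual dma_wait is left at
the end of the compute schedule, and lowers the benchmark iteration counts. The
pipelining pattern the tests exercise is unchanged; as a rough sketch (it reuses the
get_fake_conv_vtcm_schedule helper defined in this file, and the loop/block names and
annotation keys are the ones already used in the tests below, so this is illustrative
rather than a new API):

    # Build a schedule that stages A/W/C through VTCM around compute block "C".
    sch = get_fake_conv_vtcm_schedule(size_a, size_w)
    outer = sch.get_loops(sch.get_block("C"))[0]
    # Three pipeline stages on the outer loop:
    #   stage 0 = copy inputs into VTCM, stage 1 = vrmpy compute, stage 2 = copy results out.
    sch.annotate(outer, "software_pipeline_stage", [0, 1, 2])
    sch.annotate(outer, "software_pipeline_order", [0, 1, 2])
    # Mark the copy-in and copy-out stages as asynchronous so they lower to async DMA.
    sch.annotate(outer, "software_pipeline_async_stages", [0, 2])
    # The schedule is then built with PassContext config {"tir.use_async_copy": 1},
    # as done in evaluate(), so the async stages become DMA intrinsics.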
--- .../test_hexagon/test_async_dma_pipeline.py | 127 ++++++++++-------- 1 file changed, 71 insertions(+), 56 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index ef38eaefff63..5aa838e6cbc6 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -23,21 +23,30 @@ from tvm.script import tir as T from numpy.random import default_rng +from tvm.tir.function import TensorIntrin + +VRMPY_SIZE_B = 128 +VRMPY_SIZE_INT32 = 32 + def conv_approximation(size_a, size_w): + a_shape = (size_a, VRMPY_SIZE_B) + w_shape = (size_w, VRMPY_SIZE_B) + out_shape = (size_a, VRMPY_SIZE_INT32) + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [size_a, 128], dtype="uint8", align=128) - W = T.match_buffer(b, [size_w, 128], dtype="uint8", align=128) - C = T.match_buffer(c, [size_a, 32], dtype="int32", align=128) + A = T.match_buffer(a, a_shape, dtype="uint8") + W = T.match_buffer(b, w_shape, dtype="uint8") + C = T.match_buffer(c, out_shape, dtype="int32") for n, i in T.grid(size_a, size_w): with T.block("C"): vn, vi = T.axis.remap("SR", [n, i]) - T.reads(A[vn, 0:128], W[vi, 0:128], C[vn, 0:32]) - T.writes(C[vn, 0:32]) + T.reads(A[vn, 0:VRMPY_SIZE_B], W[vi, 0:VRMPY_SIZE_B], C[vn, 0:VRMPY_SIZE_INT32]) + T.writes(C[vn, 0:VRMPY_SIZE_INT32]) with T.init(): - for x in T.serial(32): + for x in T.serial(VRMPY_SIZE_INT32): C[vn, x] = 0 C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), @@ -47,6 +56,9 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.reinterpret(W[vi, T.ramp(0, 1, 128)], dtype="int32x32"), dtype="int32x32", ) + # Currently async DMA lowering does not add any wait to the end of schedules so + # for timing purposes we are manually adding a wait to ensure that all copies + # are complete when the schedule exits. 
T.evaluate( T.tvm_call_packed( "device_api.hexagon.dma_wait", @@ -70,21 +82,15 @@ def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) c_hexagon = tvm.runtime.ndarray.array( - np.zeros((size_a, 32), dtype="int32"), device=hexagon_session.device + np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32"), device=hexagon_session.device ) if tvm.testing.utils.IS_IN_CI: - # These are reduced for CI - number = 1 - repeat = 1 + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) else: - number = 100 - repeat = 100 + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10) - - timer = module.time_evaluator( - "__tvm_main__", hexagon_session.device, number=number, repeat=repeat - ) time = timer(a_hexagon, b_hexagon, c_hexagon) tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output) return round(time.mean * 1000, 4) @@ -92,48 +98,50 @@ def evaluate(hexagon_session, sch, a, b, size_a, expected_output, use_async_copy @tvm.testing.fixture def input_a(size_a): - return default_rng().integers(0, 8, (size_a, 128), dtype="uint8") + return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") @tvm.testing.fixture def input_w(size_w): - return default_rng().integers(0, 8, (size_w, 128), dtype="uint8") + return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") @tvm.testing.fixture def expected_output(size_a, size_w, input_a, input_w): - expected_output = np.zeros((size_a, 32), dtype="int32") + expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") for n in range(size_a): for x in range(size_w): - for i in range(32): + for i in range(VRMPY_SIZE_INT32): for r in range(4): expected_output[n, i] += np.uint32(input_a[n, i * 4 + r]) * np.uint32( input_w[x, i * 4 + r] ) return expected_output + def get_single_dma_schedule(size_a, size_w): + a_shape = (size_a, VRMPY_SIZE_B) + w_shape = (size_w, VRMPY_SIZE_B) + out_shape = (size_a, VRMPY_SIZE_INT32) + + a_bytes = size_a * VRMPY_SIZE_B + w_bytes = size_w * VRMPY_SIZE_B + @T.prim_func def operator(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A = T.match_buffer(a, [size_a, 128], dtype="uint8", align=128, mem_scope="global") - W = T.match_buffer(b, [size_w, 128], dtype="uint8", align=128, mem_scope="global") - C = T.match_buffer(c, [size_a, 32], dtype="int32", align=128, mem_scope="global") - A_global_vtcm = T.alloc_buffer( - [size_a, 128], dtype="uint8", align=128, mem_scope="global.vtcm" - ) - W_global_vtcm = T.alloc_buffer( - [size_w, 128], dtype="uint8", align=128, mem_scope="global.vtcm" - ) - C_global_vtcm = T.alloc_buffer( - [size_a, 32], dtype="int32", align=128, mem_scope="global.vtcm" - ) + A = T.match_buffer(a, a_shape, dtype="uint8", mem_scope="global") + W = T.match_buffer(b, w_shape, dtype="uint8", mem_scope="global") + C = T.match_buffer(c, out_shape, dtype="int32", mem_scope="global") + A_global_vtcm = T.alloc_buffer(a_shape, dtype="uint8", mem_scope="global.vtcm") + W_global_vtcm = T.alloc_buffer(w_shape, dtype="uint8", mem_scope="global.vtcm") + C_global_vtcm = T.alloc_buffer(out_shape, dtype="int32", mem_scope="global.vtcm") T.evaluate( T.tvm_call_packed( "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( A_global_vtcm.data, - 
T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, A_global_vtcm.dtype, @@ -142,14 +150,14 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: ), T.tvm_stack_make_array( A.data, - T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, A.dtype, 0, dtype="handle", ), - T.cast(size_a, dtype="int") * 128, + T.cast(a_bytes, dtype="int"), dtype="int32", ) ) @@ -158,7 +166,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( W_global_vtcm.data, - T.tvm_stack_make_shape(size_w, 128, dtype="handle"), + T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, W_global_vtcm.dtype, @@ -167,24 +175,28 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: ), T.tvm_stack_make_array( W.data, - T.tvm_stack_make_shape(size_w, 128, dtype="handle"), + T.tvm_stack_make_shape(size_w, VRMPY_SIZE_B, dtype="handle"), 0, 2, W.dtype, 0, dtype="handle", ), - T.cast(size_w, dtype="int") * 128, + T.cast(w_bytes, dtype="int"), dtype="int32", ) ) for n, i in T.grid(size_a, size_w): with T.block("C"): vn, vi = T.axis.remap("SR", [n, i]) - T.reads(A_global_vtcm[vn, 0:128], W_global_vtcm[vi, 0:128], C_global_vtcm[vn, 0:32]) - T.writes(C_global_vtcm[vn, 0:32]) + T.reads( + A_global_vtcm[vn, 0:VRMPY_SIZE_B], + W_global_vtcm[vi, 0:VRMPY_SIZE_B], + C_global_vtcm[vn, 0:VRMPY_SIZE_INT32], + ) + T.writes(C_global_vtcm[vn, 0:VRMPY_SIZE_INT32]) with T.init(): - for x in T.serial(32): + for x in T.serial(VRMPY_SIZE_INT32): C_global_vtcm[vn, x] = 0 C_global_vtcm[vn, T.ramp(0, 1, 32)] += T.call_llvm_intrin( T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), @@ -198,7 +210,7 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: "device_api.hexagon.mem_copy_DLTensor", T.tvm_stack_make_array( C.data, - T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, C.dtype, @@ -207,14 +219,14 @@ def operator(a: T.handle, b: T.handle, c: T.handle) -> None: ), T.tvm_stack_make_array( C_global_vtcm.data, - T.tvm_stack_make_shape(size_a, 128, dtype="handle"), + T.tvm_stack_make_shape(size_a, VRMPY_SIZE_B, dtype="handle"), 0, 2, C_global_vtcm.dtype, 0, dtype="handle", ), - T.cast(size_a, dtype="int") * 128, + T.cast(a_bytes, dtype="int"), dtype="int32", ) ) @@ -251,7 +263,7 @@ def print_results(test_key, runtimes): print() -class TestMatMulVec: +class TestAsyncDMAPipeline: # Removed most of these to speedup CI. 
size_a = tvm.testing.parameter( 1024, @@ -260,10 +272,10 @@ class TestMatMulVec: ) size_w = tvm.testing.parameter( - 1 * 1, - 3 * 3, - 7 * 7, - 9 * 9, + 1 * 1 * 32, + 3 * 3 * 32, + 7 * 7 * 32, + 9 * 9 * 32, ) @tvm.testing.requires_hexagon @@ -282,9 +294,7 @@ def test_loading_vtcm_for_vrmpy( return sch = conv_approximation(size_a, size_w) - base_runtime = evaluate( - hexagon_session, sch, input_a, input_w, size_a, expected_output - ) + base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) sch = get_fake_conv_vtcm_schedule(size_a, size_w) base_vtcm_runtime = evaluate( @@ -319,12 +329,17 @@ def test_loading_vtcm_for_vrmpy( ) sch = get_single_dma_schedule(size_a, size_w) - single_dma_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output) + single_dma_runtime = evaluate( + hexagon_session, sch, input_a, input_w, size_a, expected_output + ) + + # Total transfer size is equal to the size of A + W + C which is equal to 2 * size_a * 128 + size_w * 128 + transfer_mb = round((2 * size_a * VRMPY_SIZE_B + size_w * VRMPY_SIZE_B) / 1e6, 2) - transfer_mb = round((2 * size_a * 128 + size_w * 128) / 1e6, 2) - complexity = round(size_a * size_w * (128 * 4) / 1e9, 3) + # Total number of operations can be calculated given the total number of vrmpy calls (size_a * size_w) * operations per vrmpy accumulate (128 multiplies + 3 adds for reduction per lane + 1 add for accumulate per lane) + complexity = round(size_a * size_w * (VRMPY_SIZE_B * 4) / 1e9, 3) print_results( - f"Test with A.size: {size_a * 128}, W.size: {size_w * 128}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", + f"Test with A.size: {size_a * VRMPY_SIZE_B}, W.size: {size_w * VRMPY_SIZE_B}, computational complexity of {complexity} GOPs, and total memory transfer of {transfer_mb} MB...", { "without_vtcm": base_runtime, "synchronous_dma": single_dma_runtime, From a547bd6efe3f1cc1c5b6070f59ea0e71ccaacea9 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Tue, 11 Oct 2022 09:16:25 -0700 Subject: [PATCH 4/4] Skip long tests in CI. 
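
Use pytest.skip() instead of printing and returning, so skipped configurations are
reported as skips rather than passes, and guard the expensive expected_output fixture
with the same condition; the size_w parameter values are also restored. A minimal
sketch of the guard (the helper name maybe_skip_in_ci is hypothetical; IS_IN_CI,
pytest.skip, and the size thresholds are the ones used in this change):

    import pytest
    import tvm.testing

    def maybe_skip_in_ci(size_a, size_w):
        # Only the smallest configuration runs in CI; the larger shapes take too
        # long on the CI Hexagon targets.
        if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1):
            pytest.skip("Skipping test since it takes too long in CI.")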
--- .../test_hexagon/test_async_dma_pipeline.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 5aa838e6cbc6..d05e0a6e9216 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -19,6 +19,7 @@ import numpy as np import tvm +import pytest from tvm.script import tir as T from numpy.random import default_rng @@ -108,6 +109,8 @@ def input_w(size_w): @tvm.testing.fixture def expected_output(size_a, size_w, input_a, input_w): + if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): + pytest.skip("Skipping test since it takes too long in CI.") expected_output = np.zeros((size_a, VRMPY_SIZE_INT32), dtype="int32") for n in range(size_a): for x in range(size_w): @@ -272,10 +275,10 @@ class TestAsyncDMAPipeline: ) size_w = tvm.testing.parameter( - 1 * 1 * 32, - 3 * 3 * 32, - 7 * 7 * 32, - 9 * 9 * 32, + 1 * 1, + 3 * 3, + 7 * 7, + 9 * 9, ) @tvm.testing.requires_hexagon @@ -290,8 +293,7 @@ def test_loading_vtcm_for_vrmpy( ): if tvm.testing.utils.IS_IN_CI and (size_a > 1024 or size_w > 1): - print("skipping test due to ci") - return + pytest.skip("Skipping test since it takes too long in CI.") sch = conv_approximation(size_a, size_w) base_runtime = evaluate(hexagon_session, sch, input_a, input_w, size_a, expected_output)