diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 005eae68b8b7..f3d0c1af0a85 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -127,8 +127,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_hwcn.generic", ) elif layout == "NHWC": - channels = data.shape[3] - if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout == "HWOI": + if "SMLAD" in isa and kernel_layout == "HWOI": strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py index 988c3a99c059..307312076a7e 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py @@ -112,10 +112,14 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, ou cfg.reduce_axis(in_channels.value), ) - assert in_channels.value % 4 == 0 owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) cio, cii = cfg.define_split( - "tile_ci", ci, policy="factors", num_outputs=2, filter=lambda x: x.size[-1] % 4 == 0 + "tile_ci", + ci, + policy="factors", + num_outputs=2, + # TODO: check case with in_channels.value % 4 != 0 with AutoTVM + filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0, ) coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) @@ -134,6 +138,11 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, ou cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) cfg.define_knob("unroll_explicit", [0, 1]) + if cfg.is_fallback: + cfg.fallback_split("tile_ow", [-1, out_width.value]) + cfg.fallback_split("tile_ci", [-1, in_channels.value]) + cfg.fallback_split("tile_co", [-1, out_channels.value]) + return conv 
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py index fb6f7a589525..9a00fe272087 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py @@ -46,9 +46,8 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): K = K.value if isinstance(N, tvm.tir.IntImm): N = N.value - assert K % 4 == 0 # TODO(weberlo, areusch): support more dtypes? - assert in_dtype == "int8" + assert in_dtype in ("int8", "int16") assert out_dtype == "int32" A = te.placeholder((M, K), name="a", dtype=in_dtype) B = te.placeholder((N, K), name="b", dtype=in_dtype) @@ -71,13 +70,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): def intrin_func(ins, outs): aa, bb = ins cc = outs[0] + gemm_func_prefix = "gemm" if in_dtype == "int8" else "gemm16" def _reduce_update(): ib = tvm.tir.ir_builder.create() ib.emit( tvm.tir.call_extern( "int32", - f"gemm_{M}x{K}x{N}_update_{uniq_id}", + f"{gemm_func_prefix}_{M}x{K}x{N}_update_{uniq_id}", aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), @@ -102,7 +102,7 @@ def _body(): ib.emit( tvm.tir.call_extern( "int32", - f"gemm_{M}x{K}x{N}_body_{uniq_id}", + f"{gemm_func_prefix}_{M}x{K}x{N}_body_{uniq_id}", aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), @@ -122,7 +122,7 @@ def _body(): def gemm_MxKxN_impl(M, K, N, uniq_id): """Emit C code for gemm impl.""" # TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays quickly? 
- aa_pad_size = M * K + # aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) cc_code = f""" @@ -132,32 +132,270 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #include #include +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_body_rest_{uniq_id}( + int K, + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 4) * 4; + switch ( K % 4 ) {{ + case 1: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + break; + case 2: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]; + }} + }} + break; + case 3: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1] + + (int32_t) a_ptr[2] * (int32_t) b_ptr[2]; + }} + }} + break; + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. 
here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. + cc[i*C_stride + j] = sum; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif __STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ - int16_t aa_pad[{aa_pad_size}]; int16_t bb_pad[{bb_pad_size}]; + if ( {M} < 16 || {N} < 16 ) + return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + + for (int i = 0; i < {N}; i++) + for (int j = 0; j < {K} / 4; j++) + read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + for (int i = 0; i < {M}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + int16_t aa_pad_line[{K}]; + for (int l = 0; l < {K} / 4; l++) + read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]); + + for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) aa_pad_line; + int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; + int32_t sum = 0; + for (int l = 0; l < 2 * ({K} / 4); l++) {{ + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. 
+ cc[i*C_stride + j] = sum; }} }} - for (int i = 0; i < {N}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ + if ( {K} % 4 != 0 ) + gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + + return 0; +}} + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_update_rest_{uniq_id}( + int K, + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 4) * 4; + switch ( K % 4 ) {{ + case 1: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + break; + case 2: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]; + }} + }} + break; + case 3: + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int8_t *a_ptr = &aa[i * A_stride + k_base]; + int8_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0] + + (int32_t) a_ptr[1] * (int32_t) b_ptr[1] + + (int32_t) a_ptr[2] * (int32_t) b_ptr[2]; + }} + }} + break; + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}( + int8_t *aa, int8_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + cc[i*C_stride + j] += sum; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( + int8_t *aa, int8_t *bb, 
int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int16_t bb_pad[{bb_pad_size}]; + + if ( {M} < 16 || {N} < 16 ) + return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + + for (int i = 0; i < {N}; i++) + for (int j = 0; j < {K} / 4; j++) read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); + + for (int i = 0; i < {M}; i++) {{ + int16_t aa_pad_line[{K}]; + for (int l = 0; l < {K} / 4; l++) + read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4], (int32_t*) &aa_pad_line[l*4 + 2]); + + for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) aa_pad_line; + int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; + int32_t sum = 0; + for (int l = 0; l < 2 * ({K} / 4); l++) {{ + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; + }} + cc[i*C_stride + j] += sum; + }} + }} + + if ( {K} % 4 != 0 ) + gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + + return 0; +}} + + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_body_rest_{uniq_id}( + int K, + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + int k_base = (K / 2) * 2; + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int16_t *a_ptr = &aa[i * A_stride + k_base]; + int16_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]; + }} + }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + // NOTE: this is the line where `*_body` differs from `*_update`. 
here + // we're *setting* the result, instead of accumulating, because we know + // the `i` and `j` itervars span their entire respective axes. + cc[i*C_stride + j] = sum; }} }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + if ( {M} < 2 || {N} < 2 ) + return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; + int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD( - *((int32_t*) &aa_pad[i*{K} + l*2]), - *((int32_t*) &bb_pad[j*{K} + l*2]), - sum); + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; }} // NOTE: this is the line where `*_body` differs from `*_update`. here // we're *setting* the result, instead of accumulating, because we know @@ -166,46 +404,80 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): }} }} + if ( {K} % 2 != 0 ) + gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + return 0; }} + #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( - int8_t *aa, int8_t *bb, int32_t *cc, +__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_update_rest_{uniq_id}( + int K, + int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ - int16_t aa_pad[{aa_pad_size}]; - int16_t bb_pad[{bb_pad_size}]; - + int k_base = (K / 2) * 2; for (int i = 0; i < {M}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4], (int32_t*) &aa_pad[i*{K} + j*4 + 2]); + for (int j = 0; j < {N}; j++) {{ + int16_t *a_ptr = &aa[i * A_stride + k_base]; + int16_t *b_ptr = &bb[j * B_stride + k_base]; + cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) 
b_ptr[0]; }} }} + return 0; +}} - for (int i = 0; i < {N}; i++) {{ - for (int j = 0; j < {K} / 4; j++) {{ - read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4], (int32_t*) &bb_pad[i*{K} + j*4 + 2]); +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + for (int i = 0; i < {M}; i++) {{ + for (int j = 0; j < {N}; j++) {{ + int32_t sum = 0; + for (int l = 0; l < {K}; l++) {{ + sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l]; + }} + cc[i*C_stride + j] += sum; }} }} + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( + int16_t *aa, int16_t *bb, int32_t *cc, + int A_stride, int B_stride, int C_stride) {{ + if ( {M} < 2 || {N} < 2 ) + return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ + int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; + int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD( - *((int32_t*) &aa_pad[i*{K} + l*2]), - *((int32_t*) &bb_pad[j*{K} + l*2]), - sum); + sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + ++ aa_ptr; ++ bb_ptr; }} cc[i*C_stride + j] += sum; }} }} + if ( {K} % 2 != 0 ) + gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); + return 0; }} + + #ifdef __cplusplus extern "C" #endif diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py index 54c3de252f8a..c27c869509d7 100644 --- a/tests/micro/zephyr/test_utils.py +++ b/tests/micro/zephyr/test_utils.py @@ -14,8 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import io +import os import json import pathlib +import logging +import tarfile + +import numpy as np + +from urllib.request import urlopen, urlretrieve +from urllib.error import HTTPError +import json +import requests + +import tvm.micro TEMPLATE_PROJECT_DIR = ( @@ -60,3 +73,119 @@ def has_fpu(board: str): fpu_boards = [name for name, board in board_properties.items() if board["fpu"]] return board in fpu_boards + + +def build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): + project_dir = temp_dir / "project" + project = tvm.micro.generate_project( + str(TEMPLATE_PROJECT_DIR), + mod, + project_dir, + { + "extra_files_tar": extra_files_tar, + "project_type": "aot_demo", + "west_cmd": west_cmd, + "verbose": bool(build_config.get("debug")), + "zephyr_board": zephyr_board, + }, + ) + project.build() + return project, project_dir + + +def create_header_file(tensor_name, npy_data, output_path, tar_file): + """ + This method generates a header file containing the data contained in the numpy array provided. + It is used to capture the tensor data (for both inputs and expected outputs). 
+ """ + header_file = io.StringIO() + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + + if npy_data.dtype == "int8": + header_file.write(f"int8_t {tensor_name}[] =") + elif npy_data.dtype == "int32": + header_file.write(f"int32_t {tensor_name}[] = ") + elif npy_data.dtype == "uint8": + header_file.write(f"uint8_t {tensor_name}[] = ") + elif npy_data.dtype == "float32": + header_file.write(f"float {tensor_name}[] = ") + else: + raise ValueError("Data type not expected.") + + header_file.write("{") + for i in np.ndindex(npy_data.shape): + header_file.write(f"{npy_data[i]}, ") + header_file.write("};\n\n") + + header_file_bytes = bytes(header_file.getvalue(), "utf-8") + raw_path = pathlib.Path(output_path) / f"{tensor_name}.h" + ti = tarfile.TarInfo(name=str(raw_path)) + ti.size = len(header_file_bytes) + ti.mode = 0o644 + ti.type = tarfile.REGTYPE + tar_file.addfile(ti, io.BytesIO(header_file_bytes)) + + +def _read_line(fd, timeout_sec: int): + data = "" + new_line = False + while True: + if new_line: + break + new_data = fd.read(1, timeout_sec=timeout_sec) + logging.debug(f"read data: {new_data}") + for item in new_data: + new_c = chr(item) + data = data + new_c + if new_c == "\n": + new_line = True + break + return data + + +def get_message(fd, expr: str, timeout_sec: int): + while True: + data = _read_line(fd, timeout_sec) + logging.debug(f"new line: {data}") + if expr in data: + return data + + +# TODO move CMSIS integration to microtvm_api_server.py +# see https://discuss.tvm.apache.org/t/tvm-capturing-dependent-libraries-of-code-generated-tir-initially-for-use-in-model-library-format/11080 +def loadCMSIS(temp_dir): + REPO_PATH = "ARM-software/CMSIS_5" + BRANCH = "master" + API_PATH_URL = f"https://api.github.com/repos/{REPO_PATH}/git/trees" + RAW_PATH_URL = f"https://raw.githubusercontent.com/{REPO_PATH}/{BRANCH}" + + url = 
"https://api.github.com/repos/ARM-software/CMSIS_5/git/trees/master?recursive=1" + r = requests.get(url) + res = r.json() + + include_trees = {} + + for file in res["tree"]: + if file["path"] in {"CMSIS/DSP/Include", "CMSIS/DSP/Include/dsp", "CMSIS/NN/Include"}: + include_trees.update({file["path"]: file["sha"]}) + + for path, sha in include_trees.items(): + url = f"{API_PATH_URL}/{sha}" + content = json.load(urlopen(url)) + temp_path = f"{temp_dir}" + if path == "CMSIS/DSP/Include/dsp": + temp_path = f"{temp_dir}/dsp" + if not os.path.isdir(temp_path): + os.makedirs(temp_path) + for item in content["tree"]: + if item["type"] == "blob": + file_name = item["path"] + file_url = f"{RAW_PATH_URL}/{path}/{file_name}" + print(file_name, " ", file_url) + try: + urlretrieve(file_url, f"{temp_path}/{file_name}") + except HTTPError as e: + print(f"Failed to download {file_url}: {e}") diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index f03b8ecce6d0..a8a7a99a34dd 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -27,6 +27,7 @@ import numpy as np import tvm +import tvm.testing from tvm.micro.project_api import server import tvm.relay as relay @@ -36,85 +37,6 @@ import test_utils -def _build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): - project_dir = temp_dir / "project" - project = tvm.micro.generate_project( - str(test_utils.TEMPLATE_PROJECT_DIR), - mod, - project_dir, - { - "extra_files_tar": extra_files_tar, - "project_type": "aot_demo", - "west_cmd": west_cmd, - "verbose": bool(build_config.get("debug")), - "zephyr_board": zephyr_board, - }, - ) - project.build() - return project, project_dir - - -def _create_header_file(tensor_name, npy_data, output_path, tar_file): - """ - This method generates a header file containing the data contained in the numpy array provided. 
- It is used to capture the tensor data (for both inputs and expected outputs). - """ - header_file = io.StringIO() - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") - - if npy_data.dtype == "int8": - header_file.write(f"int8_t {tensor_name}[] =") - elif npy_data.dtype == "int32": - header_file.write(f"int32_t {tensor_name}[] = ") - elif npy_data.dtype == "uint8": - header_file.write(f"uint8_t {tensor_name}[] = ") - elif npy_data.dtype == "float32": - header_file.write(f"float {tensor_name}[] = ") - else: - raise ValueError("Data type not expected.") - - header_file.write("{") - for i in np.ndindex(npy_data.shape): - header_file.write(f"{npy_data[i]}, ") - header_file.write("};\n\n") - - header_file_bytes = bytes(header_file.getvalue(), "utf-8") - raw_path = pathlib.Path(output_path) / f"{tensor_name}.h" - ti = tarfile.TarInfo(name=str(raw_path)) - ti.size = len(header_file_bytes) - ti.mode = 0o644 - ti.type = tarfile.REGTYPE - tar_file.addfile(ti, io.BytesIO(header_file_bytes)) - - -def _read_line(fd, timeout_sec: int): - data = "" - new_line = False - while True: - if new_line: - break - new_data = fd.read(1, timeout_sec=timeout_sec) - logging.debug(f"read data: {new_data}") - for item in new_data: - new_c = chr(item) - data = data + new_c - if new_c == "\n": - new_line = True - break - return data - - -def _get_message(fd, expr: str, timeout_sec: int): - while True: - data = _read_line(fd, timeout_sec) - logging.debug(f"new line: {data}") - if expr in data: - return data - - @tvm.testing.requires_micro def test_tflite(temp_dir, board, west_cmd, tvm_debug): """Testing a TFLite model.""" @@ -175,12 +97,12 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): ) tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) - _create_header_file("input_data", sample, "include", tf) - _create_header_file( + 
test_utils.create_header_file("input_data", sample, "include", tf) + test_utils.create_header_file( "output_data", np.zeros(shape=output_shape, dtype="float32"), "include", tf ) - project, _ = _build_project( + project, _ = test_utils.build_project( temp_dir, board, west_cmd, @@ -192,9 +114,9 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): project.flash() with project.transport() as transport: timeout_read = 60 - _get_message(transport, "#wakeup", timeout_sec=timeout_read) + test_utils.get_message(transport, "#wakeup", timeout_sec=timeout_read) transport.write(b"start\n", timeout_sec=5) - result_line = _get_message(transport, "#result", timeout_sec=timeout_read) + result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read) result_line = result_line.strip("\n") result_line = result_line.split(":") @@ -236,10 +158,14 @@ def test_qemu_make_fail(temp_dir, board, west_cmd, tvm_debug): lowered.libmod_name, ["input_1"], ["output"], model_files_path ) tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) - _create_header_file("input_data", np.zeros(shape=shape, dtype=dtype), "include", tf) - _create_header_file("output_data", np.zeros(shape=shape, dtype=dtype), "include", tf) + test_utils.create_header_file( + "input_data", np.zeros(shape=shape, dtype=dtype), "include", tf + ) + test_utils.create_header_file( + "output_data", np.zeros(shape=shape, dtype=dtype), "include", tf + ) - project, project_dir = _build_project( + project, project_dir = test_utils.build_project( temp_dir, board, west_cmd, diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py new file mode 100644 index 000000000000..350f7e242304 --- /dev/null +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging +import os +import pathlib +import sys +import tarfile +import tempfile + +import pytest +import numpy as np + +import test_utils + +import tvm +import tvm.rpc +import tvm.micro +import tvm.testing +from tvm import relay + +from tvm.contrib.download import download_testdata +from tvm.micro.interface_api import generate_c_interface_header + +import conftest + + +_LOG = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def _open_tflite_model(): + # Import TFLite model + + model_url = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/mnist_model_quant.tflite" + model_path = download_testdata(model_url, "mnist_model_quant.tflite", module="model") + + tflite_model_buf = open(model_path, "rb").read() + + try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + + relay_mod, params = relay.frontend.from_tflite(tflite_model) + + return relay_mod, params + + +def _get_test_data(testdata_dir): + + from PIL import Image + + image_files = ["digit-2.jpg"] + + for file in image_files: + img = Image.open(testdata_dir / file).resize((28, 28)) + img = np.asarray(img).astype("uint8") + sample 
= np.reshape(img, -1) + + output_shape = (1, 10) + + return sample, output_shape + + +def _apply_desired_layout_simd(relay_mod): + + desired_layouts = {"qnn.conv2d": ["NHWC", "HWOI"], "nn.conv2d": ["NHWC", "HWOI"]} + + seq = tvm.transform.Sequential( + [relay.transform.RemoveUnusedFunctions(), relay.transform.ConvertLayout(desired_layouts)] + ) + + with tvm.transform.PassContext(opt_level=3): + return seq(relay_mod) + + +def _apply_desired_layout_no_simd(relay_mod): + + desired_layouts = {"qnn.conv2d": ["NHWC", "HWIO"], "nn.conv2d": ["NHWC", "HWIO"]} + + seq = tvm.transform.Sequential( + [relay.transform.RemoveUnusedFunctions(), relay.transform.ConvertLayout(desired_layouts)] + ) + + with tvm.transform.PassContext(opt_level=3): + return seq(relay_mod) + + +def _generate_project(temp_dir, board, west_cmd, lowered, build_config, sample, output_shape): + + with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tf: + with tempfile.TemporaryDirectory() as tar_temp_dir: + model_files_path = os.path.join(tar_temp_dir, "include") + os.mkdir(model_files_path) + test_utils.loadCMSIS(model_files_path) + tf.add(model_files_path, arcname=os.path.relpath(model_files_path, tar_temp_dir)) + header_path = generate_c_interface_header( + lowered.libmod_name, ["input_1"], ["output"], model_files_path + ) + tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) + + test_utils.create_header_file("input_data", sample, "include", tf) + test_utils.create_header_file( + "output_data", np.zeros(shape=output_shape, dtype="float32"), "include", tf + ) + + project, _ = test_utils.build_project( + temp_dir, + board, + west_cmd, + lowered, + build_config, + extra_files_tar=tar_temp_file.name, + ) + + return project + + +def _run_model(temp_dir, board, west_cmd, lowered, build_config, sample, output_shape): + + project = _generate_project( + temp_dir, board, west_cmd, lowered, build_config, sample, output_shape + ) + + 
project.flash()
+
+    with project.transport() as transport:
+        timeout_read = 60
+        transport.write(b"start\n", timeout_sec=5)
+        result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read)
+
+    result_line = result_line.strip("\n")
+    result_line = result_line.split(":")
+    result = int(result_line[1])
+    time = int(result_line[2])
+    logging.info(f"Result: {result}\ttime: {time} ms")
+
+    return result, time
+
+
+@tvm.testing.requires_micro
+def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug):
+    """Testing an ARM v7m SIMD extension."""
+
+    if board not in [
+        "mps2_an521",
+        "stm32f746xx_disco",
+        "nucleo_f746zg",
+        "nucleo_l4r5zi",
+    ]:
+        pytest.skip(msg="Platform does not support ARM v7m SIMD extension.")
+
+    model = test_utils.ZEPHYR_BOARDS[board]
+
+    build_config = {"debug": tvm_debug}
+
+    this_dir = pathlib.Path(os.path.dirname(__file__))
+    testdata_dir = this_dir.parent / "testdata" / "mnist"
+
+    relay_mod, params = _open_tflite_model()
+
+    sample, output_shape = _get_test_data(testdata_dir)
+
+    relay_mod_simd = _apply_desired_layout_simd(relay_mod)
+    # kernel layout "HWIO" is not supported by arm_cpu SIMD extension (see tvm\python\relay\op\strategy\arm_cpu.py)
+    relay_mod_no_simd = _apply_desired_layout_no_simd(relay_mod)
+
+    target = tvm.target.target.micro(
+        model,
+        options=[
+            "-keys=arm_cpu,cpu",
+            "-link-params=1",
+            "--executor=aot",
+            "--unpacked-api=1",
+            "--interface-api=c",
+        ],
+    )
+
+    temp_dir_simd = temp_dir / "simd"
+    temp_dir_no_simd = temp_dir / "nosimd"
+
+    os.makedirs(temp_dir_simd, exist_ok=True)
+    os.makedirs(temp_dir_no_simd, exist_ok=True)
+
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        lowered_simd = relay.build(relay_mod_simd, target, params=params)
+        lowered_no_simd = relay.build(relay_mod_no_simd, target, params=params)
+        result_simd, time_simd = _run_model(
+            temp_dir_simd, board, west_cmd, lowered_simd, build_config, sample, output_shape
+        )
+        result_no_simd, time_no_simd = _run_model(
+            temp_dir_no_simd, board, west_cmd, lowered_no_simd, build_config, sample, output_shape
+        )
+
+    assert result_no_simd == result_simd == 2
+
+    # Time performance measurements on QEMU emulator are always equal to zero.
+    if board not in [
+        "mps2_an521",
+    ]:
+        assert time_no_simd > time_simd
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))