From 30fe89b9edb4bd1be1eac0bedd2a621345721e1d Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 27 Aug 2020 19:54:53 +0100 Subject: [PATCH 01/10] Add dot product support for quantized convolution. We added two new intrinsics in: topi/arm_cpu/tensor_intrin.py, namely - mmla4x4: compute a matrix multiplication between tile A(4,4) and tile B(4,4) - mmla16x4: compute a matrix multiplication between tile A(rows,4) and tile B(4,16) Then we used those intrinsics in two separate strategies. We added the strategies in topi/arm_cpu/conv2d_int8.py and implemented the schedules in topi/arm_cpu/conv2d_gemm.py. In particular: - schedule_conv2d_gemm, when accelerated, packs matrix A, compute GEMM, and unpack the resulting matrix. This uses the mmla4x4 intrinsic - schedule_conv2d_gemm_hybrid doesn't do any packing on A and C which are in native form. This uses the mmla16x4 intrinsic Please note that for the limitations of `tensorize` we need to pad matrix A in both cases (when dimensions are not multiple of the tiling shape) Change-Id: Id0d818d84ffc458c6dad7983fd350a0f3d5db395 --- python/tvm/relay/op/strategy/arm_cpu.py | 38 +-- python/tvm/topi/arm_cpu/arm_utils.py | 29 +++ python/tvm/topi/arm_cpu/conv2d_alter_op.py | 95 +++++--- python/tvm/topi/arm_cpu/conv2d_gemm.py | 265 ++++++++++++++------- python/tvm/topi/arm_cpu/conv2d_int8.py | 101 ++++++-- python/tvm/topi/arm_cpu/tensor_intrin.py | 231 +++++++++++++++--- 6 files changed, 582 insertions(+), 177 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/arm_utils.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 54624ce35b55..77e9a010629e 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -135,20 +135,25 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_direct_simd.micro_dev", ) elif kernel_layout == "HWIO": - is_aarch64 = "aarch64" in str(isa.target) - + is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_fast_int8_on_arm() + if has_dot_prod and data.dtype in ["int8", "uint8"]: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), + name="conv2d_NHWC_quantized_hybrid.arm_cpu") if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu", - ) - - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu", - ) + name="conv2d_NHWC_quantized.arm_cpu") + # TODO + # This strategy errors out when tuning. Let us comment it out + # but not remove. 
+ # strategy.add_implementation( + # wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + # wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + # name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -328,12 +333,17 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() - if layout == "NHWC" and data.dtype in ["int8", "uint8"]: + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform + hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform + if layout == "NHWC" and data.dtype in ['int8', 'uint8']: strategy.add_implementation( - wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform), + wrap_compute_conv2d_gemm(interleaved_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", - ) + name="conv2d_NHWC_quantized_without_transform.arm_cpu") + strategy.add_implementation( + wrap_compute_conv2d_gemm(hybrid_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), + name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") else: raise RuntimeError( "Unsupported conv2d_NHWC_quantized_without_transform layout {0}" diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py new file mode 100644 index 000000000000..35e43a36562e --- /dev/null +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Arm target utility functions""" + +import tvm +def is_fast_int8_on_arm(): + """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + target = tvm.target.Target.current(allow_none=False) + return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + +def is_aarch64_arm(): + """ Checks whether we are compiling for an AArch64 target. 
""" + target = tvm.target.Target.current(allow_none=False) + return 'aarch64' in target.attrs.get("mtriple", "") diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index 7bf7e42237d2..bb65cb2d4664 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,9 +27,47 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config +from .arm_utils import is_fast_int8_on_arm logger = logging.getLogger("topi") +def interleave_transpose_B(inputs, data, kernel, interleave_A): + """ Return the new placeholder and the expression that represent + the matrix B transposed and interleaved""" + + assert (data.dtype == 'int8' and kernel.dtype == 'int8' or + data.dtype == 'uint8' and kernel.dtype == 'uint8') + + KH, KW, IC, OC = get_const_tuple(kernel.shape) + K = KH * KW * IC + N = OC + + if is_fast_int8_on_arm(): + tile_rows_B = 12 if interleave_A else 16 + tile_cols_B = 4 + else: + tile_rows_B = 4 + tile_cols_B = 16 + + pad_K = 0 + pad_N = 0 + + if N % tile_rows_B != 0: + pad_N = tile_rows_B - (N % tile_rows_B) + if K % tile_cols_B != 0: + pad_K = tile_cols_B - (K % tile_cols_B) + + N_padded = N + pad_N + K_padded = K + pad_K + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], + tile_rows_B, + tile_cols_B) + new_kernel = te.placeholder((N_padded // tile_rows_B, + K_padded // tile_cols_B, + tile_rows_B, + tile_cols_B), kernel.dtype) + return new_kernel, new_kernel_expr + @conv2d_alter_layout.register(["arm_cpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): @@ -280,43 +318,34 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": - assert ( - data.dtype == "int8" - and kernel.dtype == "int8" - or data.dtype == "uint8" - and kernel.dtype == "uint8" - ) assert data_layout == "NHWC" and kernel_layout == "HWIO" KH, KW, IC, OC = get_const_tuple(kernel.shape) - K = KH * KW * IC N = OC - - tile_rows = 4 - tile_cols = 16 - pad_K = 0 - pad_N = 0 - - if N % tile_rows != 0: - pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) - - N_padded = N + pad_N - K_padded = K + pad_K - kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_rows, tile_cols) - new_kernel = te.placeholder( - (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols), kernel.dtype - ) - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_workload = autotvm.task.args_to_workload( - [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], - new_workload_name, - ) + new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, + kernel, interleave_A=True) + new_workload = autotvm.task.args_to_workload([data, new_kernel, + strides, padding, dilation, + out_dtype, (KH, KW), OC], + new_workload_name) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform( - inputs[0], kernel_expr, **new_attrs - ) - + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], + new_kernel_expr, + **new_attrs) + if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + KH, KW, IC, OC = get_const_tuple(kernel.shape) + N = OC + 
new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, + kernel, interleave_A=False) + new_workload = autotvm.task.args_to_workload([data, new_kernel, + strides, padding, dilation, + out_dtype, (KH, KW), OC], + new_workload_name) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], + new_kernel_expr, + **new_attrs) return None diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 7f73cc828fa7..14a192b38821 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,19 +23,44 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl +from .tensor_intrin import gemm_quantized, gemm_quantized_impl,\ + mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32 +from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm +def configure_knobs(cfg, M, K): + """ Configure auto-tuning knobs for the interleaved strategy """ -def is_aarch64_arm(): - """ Checks whether we are compiling for an AArch64 target. """ - target = tvm.target.Target.current(allow_none=False) - return "aarch64" in target.attrs.get("mtriple", "") + x, y = cfg.axis(M // 4), cfg.axis(K // 16) + cfg.define_reorder('reorder_gemm', + [x, y], + policy='candidate', + candidate=[[x, y], + [y, x]]) + outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) + cfg.define_annotate("A_interleaved_unroll_vec", + [outer_loop, inner_loop], + policy="try_unroll_vec") + + # Fallback configuration + if cfg.is_fallback: + cfg['reorder_gemm'] = ReorderEntity([0, 1]) + cfg['A_interleaved_unroll_vec'] = AnnotateEntity(["unroll", "vec"]) + + if not is_fast_int8_on_arm(): + cfg.define_knob('gemm_quantized_unroll', [True, False]) + cfg.define_knob('gemm_quantized_interleave', [True, False]) + + if cfg.is_fallback: + cfg['gemm_quantized_unroll'] = OtherOptionEntity(False) + cfg['gemm_quantized_interleave'] = OtherOptionEntity(True) # Compute function -def compute_conv2d_gemm_without_weight_transform( - cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels -): +def compute_conv2d_gemm_without_weight_transform(cfg, + data, B_interleaved_t, + strides, padding, dilation, + out_dtype, kernel_size, + output_channels, interleave_A): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) @@ -74,38 +99,38 @@ def compute_conv2d_gemm_without_weight_transform( A_shape = (batches, M, K) if K_AREA == 1: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], - name="data_flatten", - ) + A = tvm.topi.reshape(data_pad, A_shape) else: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[ - n, - HSTR * (x // OW) + dilation_h * ((y // IC) // KW), - WSTR * (x % OW) + dilation_w * ((y // IC) % KW), - y % IC, - ], - name="data_im2col", - ) - N_transformed = B_interleaved_t.shape[0] + A = te.compute(A_shape, lambda n, x, y: + data_pad[n, + HSTR * (x // OW) + dilation_h * ((y // IC) // KW), + WSTR * (x % OW) + dilation_w * ((y // IC) % KW), y % IC], + name='data_im2col') # --- Pad if necessary - idxm = tvm.tir.indexmod + N_transformed = B_interleaved_t.shape[0] + tile_rows_B = 
B_interleaved_t.shape[2] + tile_cols_B = B_interleaved_t.shape[3] + + if is_fast_int8_on_arm() and interleave_A: + tile_rows_A = 8 + tile_cols_A = 4 + else: + tile_rows_A = 4 + tile_cols_A = 16 pad_m = 0 pad_k = 0 - if M % 4 != 0: - pad_m = 4 - (M % 4) + if M % tile_rows_A != 0: + pad_m = tile_rows_A - (M % tile_rows_A) - if K % 16 != 0: - pad_k = 16 - (K % 16) + if K % tile_cols_A != 0: + pad_k = tile_cols_A - (K % tile_cols_A) M_padded = M + pad_m K_padded = K + pad_k + N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) pad_after = (0, pad_m, pad_k) @@ -113,59 +138,75 @@ def compute_conv2d_gemm_without_weight_transform( if pad_m != 0 or pad_k != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") - # --- GEMM: A*B' + idxm = tvm.tir.indexmod k = te.reduce_axis((0, K_padded), "k") - A_interleaved = te.compute( - (batches, M_padded // 4, K_padded // 16, 4, 16), - lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], - name="A_interleaved", - ) - - C_interleaved = te.compute( - (batches, M_padded // 4, N_transformed, 4, 4), - lambda b, x, y, w, z: te.sum( - A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) - * B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), - axis=k, - ), - name="C_interleaved", - ) - - # --- Unpack C - C = te.compute( - (batches, M, N), - lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], - name="C", - ) + if interleave_A: + # Configuration space + configure_knobs(cfg, M_padded, K_padded) + + # Pack A + A_interleaved = te.compute((batches, + M_padded // tile_rows_A, + K_padded // tile_cols_A, + tile_rows_A, + tile_cols_A), + lambda b, x, y, z, w: A[b, + z + tile_rows_A * x, + w + tile_cols_A * y], + name='A_interleaved') + # Compute C + C_interleaved = te.compute((batches, + M_padded // tile_rows_A, + N_transformed, + tile_rows_A, + tile_rows_B), + lambda b, x, y, w, z: + te.sum(A_interleaved[b, + x, + k//tile_cols_A, + w, + idxm(k, tile_cols_A)].astype('int32')* + B_interleaved_t[y, + k//tile_cols_B, + z, + idxm(k, tile_cols_B)].astype('int32'), + axis=k), + name='C_interleaved') + # Unpack C + C = te.compute((batches, M, N), + lambda b, x, y: + C_interleaved[b, + x // tile_rows_A, + y // tile_rows_B, + idxm(x, tile_rows_A), + idxm(y, tile_rows_B)].astype(out_dtype), + name="C") + zero = tvm.tir.const(0) + else: + # No need to pack/unpack + C = te.compute((batches, + M_padded, + N_padded), + lambda b, x, y: + te.sum(A[b, x, k].astype('int32')* + B_interleaved_t[y//tile_rows_B, + k//tile_cols_B, + idxm(y, tile_rows_B), + idxm(k, tile_cols_B)].astype('int32'), + axis=k), + name='C') + zero = tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] - \ + tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] # --- Produce the conv output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name="conv2d_gemm_output") - - # Configuration space - x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16) - cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) - - outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate( - "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" - ) - cfg.define_knob("gemm_quantized_unroll", [True, False]) - cfg.define_knob("gemm_quantized_interleave", [True, False]) - - # Fallback configuration - if cfg.is_fallback: - cfg["reorder_gemm"] = ReorderEntity([0, 1]) - cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - 
cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) - cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) + out = te.compute(out_shape, lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name='conv2d_gemm_output') return out - -# Schedules def schedule_conv2d_gemm(cfg, s, out, final_out): - """Create schedule for tensors""" + """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] A_interleaved = C_interleaved.op.input_tensors[0] @@ -192,8 +233,8 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis - outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) - s[C_interleaved].reorder(yi, xi) + outer_gemm, inner_gemm = cfg['reorder_gemm'].apply(s, C_interleaved, [xo, yo]) + b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) s[A_interleaved].compute_at(s[C_interleaved], b_outer_gemm_fused) @@ -204,12 +245,29 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): in_type = A_interleaved.dtype out_type = C.dtype - if is_aarch64_arm() and out_type == "int32": + + k = C_interleaved.op.reduce_axis[0] + _, M, N = C.shape + if is_fast_int8_on_arm(): + mmla = mmla_4x4_int8_int8_int32(in_type) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(xi, + yi, + x_factor=8, + y_factor=4) + k_outer, k_inner = s[C_interleaved].split(k, 4) + xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4) + s[C_interleaved].reorder(b_outer_gemm_fused, inner_gemm, xi_outer, + yi_outer, k_outer, xi_inner_outer, xi_inner_inner, + yi_inner, k_inner) + s[C_interleaved].tensorize(xi_inner_inner, mmla) + s[C_interleaved].unroll(xi_inner_outer) + + elif is_aarch64_arm(): + s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - _, M, N = C.shape - assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" - unroll = cfg["gemm_quantized_unroll"].val - interleave = cfg["gemm_quantized_interleave"].val + assert in_type in ['int8', 'uint8'], "Only int8 and uint8 gemm are supported" + unroll = cfg['gemm_quantized_unroll'].val + interleave = cfg['gemm_quantized_interleave'].val gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) s[C_interleaved].pragma( b_outer_gemm_fused, @@ -218,6 +276,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): ) s[C_interleaved].tensorize(yi, gemm) + # Output transform if out != final_out: n, h, w, c = out.op.axis @@ -225,3 +284,47 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[C].compute_at(s[out], inner) s[out].vectorize(inner) return s + +def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): + """ Schedule the conv2d_gemm hybrid strategy """ + C = out.op.input_tensors[0] + A = C.op.input_tensors[0] + in_type = A.dtype + + # Computation + b, x, y = C.op.axis + k, = C.op.reduce_axis + k_outer, k_inner = s[C].split(k, 16) + x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) + s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) + mmla = mmla_16x4_int8_int8_int32(in_type, rows=1) + s[C].unroll(x_inner) + s[C].tensorize(y_inner, mmla) + s[C].parallel(x_outer) + + # Input transform + if A.op.name == "A_padded": + padding_A = True + data_im2col = A.op.input_tensors[0] + else: + padding_A = False + data_im2col = A + + b, m, n = data_im2col.op.axis + if data_im2col.op.name == "data_im2col": + n_outer, n_inner = s[data_im2col].split(n, 16) + 
s[data_im2col].unroll(n_outer) + s[data_im2col].vectorize(n_inner) + s[data_im2col].parallel(m) + elif padding_A: + s[data_im2col].compute_inline() + s[A].compute_at(s[C], x_inner) + else: + s[data_im2col].compute_at(s[C], x_inner) + + # Output transform + if out != final_out: + n, h, w, c = out.op.axis + _, inner = s[out].split(c, 4) + s[out].vectorize(inner) + return s diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 307f9e102acf..80ed67ca79a5 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,8 +24,9 @@ from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm - +from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform,\ + schedule_conv2d_gemm, schedule_conv2d_gemm_hybrid +from .arm_utils import is_fast_int8_on_arm def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): """ @@ -40,6 +41,19 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4 ) +def get_tiling_B(interleave_A): + """ Compute the tiling information for matrix B in A*B=C """ + if is_fast_int8_on_arm(): + # Two options in case dotprod is enabled + tile_cols_B = 4 + tile_rows_B = 12 if interleave_A else 16 + else: + # Single option if dotprod is not available + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B + @autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): @@ -115,30 +129,27 @@ def traverse(op): return s -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): +def _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, + dilation, out_dtype, interleave_A): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows = 4 - tile_cols = 16 - kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols) - return compute_conv2d_gemm_without_weight_transform( - cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC - ) + tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) + kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) + return compute_conv2d_gemm_without_weight_transform(cfg, + data, kernel, strides, padding, + dilation, out_dtype, (KH, KW), + OC, interleave_A) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None -): - """Compute for conv2d_NHWC_quantized without weight transform.""" - return compute_conv2d_gemm_without_weight_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels - ) +def _compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size=None, + output_channels=None, interleave_A=False): + return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size, + output_channels, interleave_A) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): 
+def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): """Create schedule for tensors""" s = te.create_schedule([x.op for x in outs]) # Vectorize the output and then inline all the rest @@ -153,12 +164,58 @@ def _callback(op): """Traverse operators from computation graph""" if op.name == "conv2d_gemm_output": conv_out = op.output(0) - schedule_conv2d_gemm(cfg, s, conv_out, out) + if interleave_A: + schedule_conv2d_gemm(cfg, s, conv_out, out) + else: + schedule_conv2d_gemm_hybrid(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: C = conv_out.op.input_tensors[0] - s[C].compute_at(s[out], inner) + if interleave_A: + s[C].compute_at(s[out], inner) traverse_inline(s, outs[0].op, _callback) return s + +#### Interleaved schedules +@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") +def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for interleaved compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, + padding, dilation, out_dtype, True) + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, dilation, + out_dtype, kernel_size, output_channels): + """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, + strides, padding, dilation, + out_dtype, kernel_size, + output_channels, True) + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") +def schedule_conv2d_NHWC_quantized(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + +#### Hybrid schedules (A non interleaved, B is interleaved and transposed) +@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") +def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for hybrid compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, + padding, dilation, out_dtype, False) + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_hybrid_without_transform(cfg, data, kernel, strides, padding, + dilation, out_dtype, + kernel_size, output_channels): + """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, + dilation, out_dtype, kernel_size, + output_channels, False) + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") +def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): + """ Interface for hybrid schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index e87bdc47d0b0..ea55cd99e876 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,36 +417,19 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") + C = te.compute((te.var("m"), te.var("n")), + lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype('int32') * + B[k // 16, y, idxm(k, 16)].astype('int32'), + axis=k), name="C") - C = te.compute( - 
(te.var("m"), te.var("n")), - lambda x, y: te.sum( - A[k // 16, x, idxm(k, 16)].astype(out_type) - * B[k // 16, y, idxm(k, 16)].astype(out_type), - axis=k, - ), - name="C", - ) + a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", + offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) - a_buffer = tvm.tir.decl_buffer( - A.shape, - dtype=in_type, - name="a_buffer", - offset_factor=1, - strides=[te.var("sa_1"), te.var("sa_2"), 1], - ) - - b_buffer = tvm.tir.decl_buffer( - B.shape, - dtype=in_type, - name="b_buffer", - offset_factor=1, - strides=[te.var("sb_1"), te.var("sb_2"), 1], - ) + b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", + offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) - c_buffer = tvm.tir.decl_buffer( - C.shape, dtype=out_type, name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] - ) + c_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="c_buffer", + offset_factor=1, strides=[te.var('sc'), 1]) def _intrin_func(ins, outs): def _instr(): @@ -589,6 +572,200 @@ def _instr(index): ) +def select_word(vec, lane, dtype_vec): + """ + Utility function used to select a int8x4 word within a int8x16 vector + and replicate 4 times. + The pseudo-code for this operation is: + + v = [x0, ..., x15] + vsub(i) = v[i:i+3] + replicated_v(i) = [vsub(i), vsub(i), vsub(i), vsub(i)] + + Note that i can vary between 0 and 3 + """ + # Reinterpret vec_a as 4 int32 words + vec_int32 = tvm.tir.call_intrin('int32x4', 'tir.reinterpret', vec) + # Broadcast the lane-th word + vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) + # Convert back to uint8x16 + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, 'tir.reinterpret', vec_int32_shuffled) + return vec_int8_broadcast + +def mmla_4x4_int8_int8_int32(dtype): + """ + Int8 4x4 matrix multiplication using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[4][4] and + B[4][4] and produces a 4x4 matrix which is equal to A*B + The pseudo code is as follows. + + .. code-block:: c + + void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 output[4][4]){ + for (int i = 0; i < 4; i++){ + for (int j = 0; i < 4; i++){ + out[i][j] = 0; + for (int k = 0; k < 4; k++){ + out[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * Matrix A is interleaved + This function returns a TensorIntrin that can be used to tensorize a schedule. 
+ + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + data = te.placeholder((te.var("rows"), 4), dtype, name='data') + kernel = te.placeholder((4, 4), dtype, name='kernel') + dtype_vec = dtype + 'x16' + + k = te.reduce_axis((0, 4), name='k') + C = te.compute((te.var("rows"), 4), + lambda i, j: te.sum(data[i, k].astype('int32') * + kernel[j, k].astype('int32'), + axis=k), name="C") + + aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", + offset_factor=1, + strides=[te.var('sa'), 1]) + bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", + offset_factor=1, + strides=[te.var('sb'), 1]) + cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", + offset_factor=1, + strides=[te.var('sc'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, 4): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x4'))) + return ib.get() + + vec_a = ins[0].vload([0, 0]) + vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + vec_b = ins[1].vload([0, 0], dtype_vec) + + # Execute the dot product + for i in range(0, 4): + vec_c = outs[0].vload([i, 0], 'int32x4') + vdot = tvm.tir.call_llvm_intrin( + 'int32x4', + 'llvm.aarch64.neon.sdot', + tvm.tir.const(3, 'uint32'), + vec_c, vec_b, vec_aa[i]) + + # Store the result + ib.emit(outs[0].vstore([i, 0], vdot)) + + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, + default_buffer_params=buffer_params) + +def mmla_16x4_int8_int8_int32(dtype, rows): + """ + Int8 16x4 matrix multiplication using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[rows][4] and + B[4][16] and produces a rowsx16 matrix which is equal to A*B + The pseudo code is as follows. + + .. code-block:: c + + void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 output[rows][16]){ + for (int i = 0; i < rows; i++){ + for (int j = 0; i < 16; i++){ + out[i][j] = 0; + for (int k = 0; k < 4; k++){ + out[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * A is not interleaved, but used in its native form + This function returns a TensorIntrin that can be used to tensorize a schedule. 
+ + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + data = te.placeholder((rows, 16), dtype, name='data') + kernel = te.placeholder((4, 16, 4), dtype, name='kernel') + dtype_vec = dtype + 'x16' + idxm = tvm.tir.indexmod + k = te.reduce_axis((0, 16), name='k') + C = te.compute((rows, 16), + lambda i, j: te.sum(data[i, k].astype('int32') * + kernel[k//4, j, idxm(k, 4)].astype('int32'), + axis=k), name="C") + + aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", + offset_factor=1, + strides=[te.var('sa'), 1]) + bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", + offset_factor=1, + strides=[te.var('sb0'), te.var('sb1'), 1]) + cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", + offset_factor=1, + strides=[te.var('sc'), 1]) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, rows): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x16'))) + return ib.get() + + for k in range(0, rows): + vec_a = ins[0].vload([k, 0], dtype_vec) + + for j in range(0, 4): + for i in range(0, 4): + vec_aa = select_word(vec_a, i, dtype_vec) + vec_b = ins[1].vload([i, 4*j, 0], dtype_vec) + vec_c = outs[0].vload([k, 4*j], 'int32x4') + vdot = tvm.tir.call_llvm_intrin( + 'int32x4', + 'llvm.aarch64.neon.sdot', + tvm.tir.const(3, 'uint32'), + vec_c, vec_b, vec_aa) + ib.emit(outs[0].vstore([k, 4*j], vdot)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, + default_buffer_params=buffer_params) + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics From 8b69ffd1e76bdf56e044f247a439a96fe044bc92 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 10 Sep 2020 19:52:45 +0100 Subject: [PATCH 02/10] Add back nhwc_spatial_pack strategy as default Change-Id: I8b1826a7ae1d742956296e8d157da19955a4942c --- python/tvm/relay/op/strategy/arm_cpu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 77e9a010629e..78251c89bd6f 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -147,13 +147,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), name="conv2d_NHWC_quantized.arm_cpu") - # TODO - # This strategy errors out when tuning. Let us comment it out - # but not remove. - # strategy.add_implementation( - # wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - # wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - # name="conv2d_nhwc_spatial_pack.arm_cpu") + if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): + # TODO + # This strategy errors out for quantized data types when tuning. 
+ # Let's use this only for non-aarch64 or non-quantized cases + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) From 16292e921c16c6f37cd1012e0cd3a7779ae0c850 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 16 Sep 2020 11:13:58 +0100 Subject: [PATCH 03/10] Fix linting through Black Change-Id: Ic74ef5461a90bca9f4d4980a214137e384d5f923 --- python/tvm/relay/op/strategy/arm_cpu.py | 17 +- python/tvm/topi/arm_cpu/arm_utils.py | 5 +- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 60 ++++--- python/tvm/topi/arm_cpu/conv2d_gemm.py | 190 +++++++++++---------- python/tvm/topi/arm_cpu/conv2d_int8.py | 101 +++++++---- python/tvm/topi/arm_cpu/tensor_intrin.py | 172 +++++++++++-------- 6 files changed, 326 insertions(+), 219 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 78251c89bd6f..9fe6e28e6f84 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -141,12 +141,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid.arm_cpu") + name="conv2d_NHWC_quantized_hybrid.arm_cpu", + ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu") + name="conv2d_NHWC_quantized.arm_cpu", + ) if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): # TODO # This strategy errors out for quantized data types when tuning. 
@@ -154,7 +156,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + name="conv2d_nhwc_spatial_pack.arm_cpu", + ) else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -336,15 +339,17 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform - if layout == "NHWC" and data.dtype in ['int8', 'uint8']: + if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d_gemm(interleaved_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu") + name="conv2d_NHWC_quantized_without_transform.arm_cpu", + ) strategy.add_implementation( wrap_compute_conv2d_gemm(hybrid_compute), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") + name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu", + ) else: raise RuntimeError( "Unsupported conv2d_NHWC_quantized_without_transform layout {0}" diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index 35e43a36562e..b9c7010d6ee3 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -18,12 +18,15 @@ """Arm target utility functions""" import tvm + + def is_fast_int8_on_arm(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + def is_aarch64_arm(): """ Checks whether we are compiling for an AArch64 target. 
""" target = tvm.target.Target.current(allow_none=False) - return 'aarch64' in target.attrs.get("mtriple", "") + return "aarch64" in target.attrs.get("mtriple", "") diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index bb65cb2d4664..d669fa695e58 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -31,12 +31,17 @@ logger = logging.getLogger("topi") + def interleave_transpose_B(inputs, data, kernel, interleave_A): """ Return the new placeholder and the expression that represent the matrix B transposed and interleaved""" - assert (data.dtype == 'int8' and kernel.dtype == 'int8' or - data.dtype == 'uint8' and kernel.dtype == 'uint8') + assert ( + data.dtype == "int8" + and kernel.dtype == "int8" + or data.dtype == "uint8" + and kernel.dtype == "uint8" + ) KH, KW, IC, OC = get_const_tuple(kernel.shape) K = KH * KW * IC @@ -59,13 +64,12 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): N_padded = N + pad_N K_padded = K + pad_K - new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], - tile_rows_B, - tile_cols_B) - new_kernel = te.placeholder((N_padded // tile_rows_B, - K_padded // tile_cols_B, - tile_rows_B, - tile_cols_B), kernel.dtype) + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform( + inputs[1], tile_rows_B, tile_cols_B + ) + new_kernel = te.placeholder( + (N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype + ) return new_kernel, new_kernel_expr @@ -322,30 +326,32 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): KH, KW, IC, OC = get_const_tuple(kernel.shape) N = OC new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, - kernel, interleave_A=True) - new_workload = autotvm.task.args_to_workload([data, new_kernel, - strides, padding, dilation, - out_dtype, (KH, KW), OC], - new_workload_name) + new_kernel, new_kernel_expr = interleave_transpose_B( + inputs, data, kernel, interleave_A=True + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], - new_kernel_expr, - **new_attrs) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs + ) if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" KH, KW, IC, OC = get_const_tuple(kernel.shape) N = OC new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B(inputs, data, - kernel, interleave_A=False) - new_workload = autotvm.task.args_to_workload([data, new_kernel, - strides, padding, dilation, - out_dtype, (KH, KW), OC], - new_workload_name) + new_kernel, new_kernel_expr = interleave_transpose_B( + inputs, data, kernel, interleave_A=False + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], - new_kernel_expr, - **new_attrs) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs + ) return None diff 
--git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 14a192b38821..09c53001963f 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,44 +23,53 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl,\ - mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32 +from .tensor_intrin import ( + gemm_quantized, + gemm_quantized_impl, + mmla_4x4_int8_int8_int32, + mmla_16x4_int8_int8_int32, +) from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm + def configure_knobs(cfg, M, K): """ Configure auto-tuning knobs for the interleaved strategy """ x, y = cfg.axis(M // 4), cfg.axis(K // 16) - cfg.define_reorder('reorder_gemm', - [x, y], - policy='candidate', - candidate=[[x, y], - [y, x]]) + cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate("A_interleaved_unroll_vec", - [outer_loop, inner_loop], - policy="try_unroll_vec") + cfg.define_annotate( + "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + ) # Fallback configuration if cfg.is_fallback: - cfg['reorder_gemm'] = ReorderEntity([0, 1]) - cfg['A_interleaved_unroll_vec'] = AnnotateEntity(["unroll", "vec"]) + cfg["reorder_gemm"] = ReorderEntity([0, 1]) + cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) if not is_fast_int8_on_arm(): - cfg.define_knob('gemm_quantized_unroll', [True, False]) - cfg.define_knob('gemm_quantized_interleave', [True, False]) + cfg.define_knob("gemm_quantized_unroll", [True, False]) + cfg.define_knob("gemm_quantized_interleave", [True, False]) if cfg.is_fallback: - cfg['gemm_quantized_unroll'] = OtherOptionEntity(False) - cfg['gemm_quantized_interleave'] = OtherOptionEntity(True) + cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) + cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) + # Compute function -def compute_conv2d_gemm_without_weight_transform(cfg, - data, B_interleaved_t, - strides, padding, dilation, - out_dtype, kernel_size, - output_channels, interleave_A): +def compute_conv2d_gemm_without_weight_transform( + cfg, + data, + B_interleaved_t, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, +): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) @@ -101,11 +110,16 @@ def compute_conv2d_gemm_without_weight_transform(cfg, if K_AREA == 1: A = tvm.topi.reshape(data_pad, A_shape) else: - A = te.compute(A_shape, lambda n, x, y: - data_pad[n, - HSTR * (x // OW) + dilation_h * ((y // IC) // KW), - WSTR * (x % OW) + dilation_w * ((y // IC) % KW), y % IC], - name='data_im2col') + A = te.compute( + A_shape, + lambda n, x, y: data_pad[ + n, + HSTR * (x // OW) + dilation_h * ((y // IC) // KW), + WSTR * (x % OW) + dilation_w * ((y // IC) % KW), + y % IC, + ], + name="data_im2col", + ) # --- Pad if necessary N_transformed = B_interleaved_t.shape[0] @@ -146,65 +160,58 @@ def compute_conv2d_gemm_without_weight_transform(cfg, configure_knobs(cfg, M_padded, K_padded) # Pack A - A_interleaved = te.compute((batches, - M_padded // tile_rows_A, - K_padded // tile_cols_A, - tile_rows_A, - tile_cols_A), - lambda b, x, y, z, w: A[b, - z + tile_rows_A * x, - w + 
tile_cols_A * y], - name='A_interleaved') + A_interleaved = te.compute( + (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), + lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], + name="A_interleaved", + ) # Compute C - C_interleaved = te.compute((batches, - M_padded // tile_rows_A, - N_transformed, - tile_rows_A, - tile_rows_B), - lambda b, x, y, w, z: - te.sum(A_interleaved[b, - x, - k//tile_cols_A, - w, - idxm(k, tile_cols_A)].astype('int32')* - B_interleaved_t[y, - k//tile_cols_B, - z, - idxm(k, tile_cols_B)].astype('int32'), - axis=k), - name='C_interleaved') + C_interleaved = te.compute( + (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), + lambda b, x, y, w, z: te.sum( + A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") + * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), + axis=k, + ), + name="C_interleaved", + ) # Unpack C - C = te.compute((batches, M, N), - lambda b, x, y: - C_interleaved[b, - x // tile_rows_A, - y // tile_rows_B, - idxm(x, tile_rows_A), - idxm(y, tile_rows_B)].astype(out_dtype), - name="C") + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, x // tile_rows_A, y // tile_rows_B, idxm(x, tile_rows_A), idxm(y, tile_rows_B) + ].astype(out_dtype), + name="C", + ) zero = tvm.tir.const(0) else: # No need to pack/unpack - C = te.compute((batches, - M_padded, - N_padded), - lambda b, x, y: - te.sum(A[b, x, k].astype('int32')* - B_interleaved_t[y//tile_rows_B, - k//tile_cols_B, - idxm(y, tile_rows_B), - idxm(k, tile_cols_B)].astype('int32'), - axis=k), - name='C') - zero = tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] - \ - tvm.tir.const(1, C.dtype) * C[0, M_padded-1, N_padded-1] + C = te.compute( + (batches, M_padded, N_padded), + lambda b, x, y: te.sum( + A[b, x, k].astype("int32") + * B_interleaved_t[ + y // tile_rows_B, k // tile_cols_B, idxm(y, tile_rows_B), idxm(k, tile_cols_B) + ].astype("int32"), + axis=k, + ), + name="C", + ) + zero = ( + tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + ) # --- Produce the conv output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), - name='conv2d_gemm_output') + out = te.compute( + out_shape, + lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name="conv2d_gemm_output", + ) return out + def schedule_conv2d_gemm(cfg, s, out, final_out): """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] @@ -233,7 +240,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis - outer_gemm, inner_gemm = cfg['reorder_gemm'].apply(s, C_interleaved, [xo, yo]) + outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) @@ -250,24 +257,31 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): _, M, N = C.shape if is_fast_int8_on_arm(): mmla = mmla_4x4_int8_int8_int32(in_type) - xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile(xi, - yi, - x_factor=8, - y_factor=4) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( + xi, yi, x_factor=8, y_factor=4 + ) k_outer, k_inner = s[C_interleaved].split(k, 4) xi_inner_outer, xi_inner_inner = 
s[C_interleaved].split(xi_inner, 4) - s[C_interleaved].reorder(b_outer_gemm_fused, inner_gemm, xi_outer, - yi_outer, k_outer, xi_inner_outer, xi_inner_inner, - yi_inner, k_inner) + s[C_interleaved].reorder( + b_outer_gemm_fused, + inner_gemm, + xi_outer, + yi_outer, + k_outer, + xi_inner_outer, + xi_inner_inner, + yi_inner, + k_inner, + ) s[C_interleaved].tensorize(xi_inner_inner, mmla) s[C_interleaved].unroll(xi_inner_outer) elif is_aarch64_arm(): s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - assert in_type in ['int8', 'uint8'], "Only int8 and uint8 gemm are supported" - unroll = cfg['gemm_quantized_unroll'].val - interleave = cfg['gemm_quantized_interleave'].val + assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" + unroll = cfg["gemm_quantized_unroll"].val + interleave = cfg["gemm_quantized_interleave"].val gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type) s[C_interleaved].pragma( b_outer_gemm_fused, @@ -276,7 +290,6 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): ) s[C_interleaved].tensorize(yi, gemm) - # Output transform if out != final_out: n, h, w, c = out.op.axis @@ -285,6 +298,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[out].vectorize(inner) return s + def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): """ Schedule the conv2d_gemm hybrid strategy """ C = out.op.input_tensors[0] @@ -293,7 +307,7 @@ def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): # Computation b, x, y = C.op.axis - k, = C.op.reduce_axis + (k,) = C.op.reduce_axis k_outer, k_inner = s[C].split(k, 16) x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 80ed67ca79a5..022e32704b3c 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,10 +24,14 @@ from .. 
import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform,\ - schedule_conv2d_gemm, schedule_conv2d_gemm_hybrid +from .conv2d_gemm import ( + compute_conv2d_gemm_without_weight_transform, + schedule_conv2d_gemm, + schedule_conv2d_gemm_hybrid, +) from .arm_utils import is_fast_int8_on_arm + def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): """ Get default int8 schedule config for the workload @@ -41,6 +45,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4 ) + def get_tiling_B(interleave_A): """ Compute the tiling information for matrix B in A*B=C """ if is_fast_int8_on_arm(): @@ -129,24 +134,43 @@ def traverse(op): return s -def _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, - dilation, out_dtype, interleave_A): +def _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, interleave_A +): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) - return compute_conv2d_gemm_without_weight_transform(cfg, - data, kernel, strides, padding, - dilation, out_dtype, (KH, KW), - OC, interleave_A) + return compute_conv2d_gemm_without_weight_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC, interleave_A + ) + -def _compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, - dilation, out_dtype, kernel_size=None, - output_channels=None, interleave_A=False): - return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, - dilation, out_dtype, kernel_size, - output_channels, interleave_A) +def _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size=None, + output_channels=None, + interleave_A=False, +): + return compute_conv2d_gemm_without_weight_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, + ) def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): @@ -178,42 +202,59 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + #### Interleaved schedules @autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Interface for interleaved compute_conv2d_NHWC_quantized""" - return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, - padding, dilation, out_dtype, True) + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, True + ) + @autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, dilation, - out_dtype, kernel_size, output_channels): +def compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" - return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, - strides, padding, dilation, - out_dtype, kernel_size, - output_channels, True) + return 
_compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True + ) + @autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") def schedule_conv2d_NHWC_quantized(cfg, outs): """ Interface for interleaved schedule_conv2d_NHWC_quantized""" return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + #### Hybrid schedules (A non interleaved, B is interleaved and transposed) @autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Interface for hybrid compute_conv2d_NHWC_quantized""" - return _compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, - padding, dilation, out_dtype, False) + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, False + ) + @autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid_without_transform(cfg, data, kernel, strides, padding, - dilation, out_dtype, - kernel_size, output_channels): +def compute_conv2d_NHWC_quantized_hybrid_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" - return _compute_conv2d_NHWC_quantized_without_transform(cfg, data, kernel, strides, padding, - dilation, out_dtype, kernel_size, - output_channels, False) + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + False, + ) + @autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index ea55cd99e876..3d1c97139daa 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,19 +417,34 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") - C = te.compute((te.var("m"), te.var("n")), - lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype('int32') * - B[k // 16, y, idxm(k, 16)].astype('int32'), - axis=k), name="C") + C = te.compute( + (te.var("m"), te.var("n")), + lambda x, y: te.sum( + A[k // 16, x, idxm(k, 16)].astype("int32") * B[k // 16, y, idxm(k, 16)].astype("int32"), + axis=k, + ), + name="C", + ) - a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", - offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) + a_buffer = tvm.tir.decl_buffer( + A.shape, + dtype=in_type, + name="a_buffer", + offset_factor=1, + strides=[te.var("sa_1"), te.var("sa_2"), 1], + ) - b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", - offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) + b_buffer = tvm.tir.decl_buffer( + B.shape, + dtype=in_type, + name="b_buffer", + offset_factor=1, + strides=[te.var("sb_1"), te.var("sb_2"), 1], + ) - c_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="c_buffer", - offset_factor=1, strides=[te.var('sc'), 1]) + c_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(): @@ -585,13 +600,14 @@ def select_word(vec, lane, dtype_vec): Note that i can vary between 0 and 
3 """ # Reinterpret vec_a as 4 int32 words - vec_int32 = tvm.tir.call_intrin('int32x4', 'tir.reinterpret', vec) + vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) # Broadcast the lane-th word vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) # Convert back to uint8x16 - vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, 'tir.reinterpret', vec_int32_shuffled) + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled) return vec_int8_broadcast + def mmla_4x4_int8_int8_int32(dtype): """ Int8 4x4 matrix multiplication using sdot/udot instructions @@ -626,32 +642,33 @@ def mmla_4x4_int8_int8_int32(dtype): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((te.var("rows"), 4), dtype, name='data') - kernel = te.placeholder((4, 4), dtype, name='kernel') - dtype_vec = dtype + 'x16' - - k = te.reduce_axis((0, 4), name='k') - C = te.compute((te.var("rows"), 4), - lambda i, j: te.sum(data[i, k].astype('int32') * - kernel[j, k].astype('int32'), - axis=k), name="C") - - aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", - offset_factor=1, - strides=[te.var('sa'), 1]) - bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", - offset_factor=1, - strides=[te.var('sb'), 1]) - cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", - offset_factor=1, - strides=[te.var('sc'), 1]) + data = te.placeholder((te.var("rows"), 4), dtype, name="data") + kernel = te.placeholder((4, 4), dtype, name="kernel") + dtype_vec = dtype + "x16" + + k = te.reduce_axis((0, 4), name="k") + C = te.compute( + (te.var("rows"), 4), + lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, k].astype("int32"), axis=k), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + kernel.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: for i in range(0, 4): - ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x4'))) + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() vec_a = ins[0].vload([0, 0]) @@ -660,12 +677,15 @@ def _instr(index): # Execute the dot product for i in range(0, 4): - vec_c = outs[0].vload([i, 0], 'int32x4') + vec_c = outs[0].vload([i, 0], "int32x4") vdot = tvm.tir.call_llvm_intrin( - 'int32x4', - 'llvm.aarch64.neon.sdot', - tvm.tir.const(3, 'uint32'), - vec_c, vec_b, vec_aa[i]) + "int32x4", + "llvm.aarch64.neon.sdot", + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], + ) # Store the result ib.emit(outs[0].vstore([i, 0], vdot)) @@ -677,8 +697,12 @@ def _instr(index): buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( - C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, - default_buffer_params=buffer_params) + C.op, + _intrin_func, + binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + def mmla_16x4_int8_int8_int32(dtype, rows): """ @@ -714,32 +738,39 @@ def mmla_16x4_int8_int8_int32(dtype, rows): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((rows, 16), dtype, name='data') - 
kernel = te.placeholder((4, 16, 4), dtype, name='kernel') - dtype_vec = dtype + 'x16' + data = te.placeholder((rows, 16), dtype, name="data") + kernel = te.placeholder((4, 16, 4), dtype, name="kernel") + dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod - k = te.reduce_axis((0, 16), name='k') - C = te.compute((rows, 16), - lambda i, j: te.sum(data[i, k].astype('int32') * - kernel[k//4, j, idxm(k, 4)].astype('int32'), - axis=k), name="C") - - aa_buffer = tvm.tir.decl_buffer(data.shape, dtype, name="aa_buffer", - offset_factor=1, - strides=[te.var('sa'), 1]) - bb_buffer = tvm.tir.decl_buffer(kernel.shape, dtype, name="bb_buffer", - offset_factor=1, - strides=[te.var('sb0'), te.var('sb1'), 1]) - cc_buffer = tvm.tir.decl_buffer(C.shape, dtype='int32', name="cc_buffer", - offset_factor=1, - strides=[te.var('sc'), 1]) + k = te.reduce_axis((0, 16), name="k") + C = te.compute( + (rows, 16), + lambda i, j: te.sum( + data[i, k].astype("int32") * kernel[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + ), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + kernel.shape, + dtype, + name="bb_buffer", + offset_factor=1, + strides=[te.var("sb0"), te.var("sb1"), 1], + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: for i in range(0, rows): - ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 'int32x16'))) + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) return ib.get() for k in range(0, rows): @@ -748,14 +779,17 @@ def _instr(index): for j in range(0, 4): for i in range(0, 4): vec_aa = select_word(vec_a, i, dtype_vec) - vec_b = ins[1].vload([i, 4*j, 0], dtype_vec) - vec_c = outs[0].vload([k, 4*j], 'int32x4') + vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( - 'int32x4', - 'llvm.aarch64.neon.sdot', - tvm.tir.const(3, 'uint32'), - vec_c, vec_b, vec_aa) - ib.emit(outs[0].vstore([k, 4*j], vdot)) + "int32x4", + "llvm.aarch64.neon.sdot", + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa, + ) + ib.emit(outs[0].vstore([k, 4 * j], vdot)) return ib.get() # body, reset, update @@ -763,8 +797,12 @@ def _instr(index): buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( - C.op, _intrin_func, binds={data:aa_buffer, kernel:bb_buffer, C:cc_buffer}, - default_buffer_params=buffer_params) + C.op, + _intrin_func, + binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + def _q_multiply_shift_arm(op): """ From 7c8c8671ccf0a0efe9aaee019af743903f0ad728 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 18 Sep 2020 14:56:21 +0100 Subject: [PATCH 04/10] Fix python linting Change-Id: I5fb8a2ae4467a87bd3470f6b3753c074f9b7cc78 --- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index d669fa695e58..eeea3d785c7c 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -33,7 +33,7 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): - """ Return the new placeholder and the expression that represent + """Return the new placeholder and the expression that represent the 
matrix B transposed and interleaved""" assert ( From 8168589019d2893f49a335fd23497eee61a7387f Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 24 Sep 2020 21:59:41 +0100 Subject: [PATCH 05/10] Addressing review comments Change-Id: I284b1f2c121051e672f548d6c6ee2a3267854e31 --- python/tvm/relay/op/strategy/arm_cpu.py | 32 +++++----- python/tvm/topi/arm_cpu/arm_utils.py | 47 ++++++++++++++- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 52 +++++++++------- python/tvm/topi/arm_cpu/conv2d_gemm.py | 66 ++++++++++++-------- python/tvm/topi/arm_cpu/conv2d_int8.py | 70 ++++++++++------------ python/tvm/topi/arm_cpu/tensor_intrin.py | 20 ++++++- 6 files changed, 181 insertions(+), 106 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 9fe6e28e6f84..6759a54d0b80 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -136,21 +136,21 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): ) elif kernel_layout == "HWIO": is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() - has_dot_prod = topi.arm_cpu.arm_utils.is_fast_int8_on_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available() if has_dot_prod and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native.arm_cpu", ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved.arm_cpu", ) if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): - # TODO + # TODO(@giuseros) # This strategy errors out for quantized data types when tuning. 
# Let's use this only for non-aarch64 or non-quantized cases strategy.add_implementation( @@ -337,18 +337,18 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() - interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform - hybrid_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_hybrid_without_transform + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform + native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d_gemm(interleaved_compute), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(native_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native_without_transform.arm_cpu", ) strategy.add_implementation( - wrap_compute_conv2d_gemm(hybrid_compute), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_hybrid), - name="conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(interleaved_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu", ) else: raise RuntimeError( diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index b9c7010d6ee3..c18312a7045c 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -20,7 +20,7 @@ import tvm -def is_fast_int8_on_arm(): +def is_dotprod_available(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) return "+v8.2a" in target.mattr and "+dotprod" in target.mattr @@ -30,3 +30,48 @@ def is_aarch64_arm(): """ Checks whether we are compiling for an AArch64 target. """ target = tvm.target.Target.current(allow_none=False) return "aarch64" in target.attrs.get("mtriple", "") + + +def get_tiling_B_interleaved_t(interleave_A): + """ Compute the tiling information for matrix B', where B' + is the transposed and interleaved version of matrix B in C=A*B. + + The tiling information is chosen to maximize register usage during the + tile computation. + + Please refer to: + - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + In order to have more information + + Parameters + ---------- + interleave_A: bool + determines if A is expected to be interleaved + + Returns + ---------- + tile_rows_B: the output tile rows of B' + tile_cols_B: the output tile columns of B' + """ + if is_dotprod_available(): + # The number of tile rows of B' vary depending on the + # strategy: + # * If we are interleaving A, then we select 12 columns from B'(i.e., + # 12 rows from B). + # * If we are not interleaving A, then we select 16 columns from B'(i.e., + # 16 rows from B). + tile_rows_B = 12 if interleave_A else 16 + + # Dot product instruction groups 2 (u)int16x8 vectors in + # groups of 4 and compute the dot product among those groups + # This means that the number of columns in a tile of B' (i.e., the + # rows of the original matrix B) need to be 4. + tile_cols_B = 4 + else: + # If dot product is not available, A must be interleaved. 
In this case + # we load 4 rows of B' (i.e., 4 columns of B). Each of them will contain 16 elements + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index eeea3d785c7c..f81d8f593390 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,15 +27,31 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config -from .arm_utils import is_fast_int8_on_arm +from .arm_utils import get_tiling_B_interleaved_t logger = logging.getLogger("topi") -def interleave_transpose_B(inputs, data, kernel, interleave_A): - """Return the new placeholder and the expression that represent - the matrix B transposed and interleaved""" - +def interleave_transpose_weights(inputs, data, kernel, interleave_A): + """ Transform the weight matrix by reshaping, interleaving and transposing it + + Parameters + ---------- + inputs : tvm.relay.Expr + Grouped input symbols + data : + Input shape and dtype + kernel : + Input shape and dtype + interleave_A: indicates if we expect matrix A to be interleaved + + Returns + ---------- + new_kernel : tvm.te.placeholder + A placeholder with the new shape + new_kernel_expr : tvm.relay.Expr + The relay expression of the weights + """ assert ( data.dtype == "int8" and kernel.dtype == "int8" @@ -47,12 +63,8 @@ def interleave_transpose_B(inputs, data, kernel, interleave_A): K = KH * KW * IC N = OC - if is_fast_int8_on_arm(): - tile_rows_B = 12 if interleave_A else 16 - tile_cols_B = 4 - else: - tile_rows_B = 4 - tile_cols_B = 16 + # Get tiling information for the interleaved transposed version of B + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) pad_K = 0 pad_N = 0 @@ -321,12 +333,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": + if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - N = OC - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B( + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( inputs, data, kernel, interleave_A=True ) new_workload = autotvm.task.args_to_workload( @@ -338,12 +349,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.contrib_conv2d_gemm_without_weight_transform( inputs[0], new_kernel_expr, **new_attrs ) - if topi_tmpl == "conv2d_NHWC_quantized_hybrid.arm_cpu": + if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - N = OC - new_workload_name = "conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu" - new_kernel, new_kernel_expr = interleave_transpose_B( + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( inputs, data, kernel, interleave_A=False ) new_workload = autotvm.task.args_to_workload( diff 
--git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 09c53001963f..6896fd03289a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -29,7 +29,7 @@ mmla_4x4_int8_int8_int32, mmla_16x4_int8_int8_int32, ) -from .arm_utils import is_aarch64_arm, is_fast_int8_on_arm +from .arm_utils import is_aarch64_arm, is_dotprod_available def configure_knobs(cfg, M, K): @@ -48,7 +48,7 @@ def configure_knobs(cfg, M, K): cfg["reorder_gemm"] = ReorderEntity([0, 1]) cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - if not is_fast_int8_on_arm(): + if not is_dotprod_available(): cfg.define_knob("gemm_quantized_unroll", [True, False]) cfg.define_knob("gemm_quantized_interleave", [True, False]) @@ -76,8 +76,7 @@ def compute_conv2d_gemm_without_weight_transform( KH, KW = get_const_tuple(kernel_size) OC = get_const_int(output_channels) - - K_AREA = KH * KW + kernel_area = KH * KW if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -101,13 +100,13 @@ def compute_conv2d_gemm_without_weight_transform( else: data_pad = data - # --- Im2col + # Im2col M = OH * OW - K = IC * K_AREA + K = IC * kernel_area N = OC A_shape = (batches, M, K) - if K_AREA == 1: + if kernel_area == 1: A = tvm.topi.reshape(data_pad, A_shape) else: A = te.compute( @@ -121,35 +120,48 @@ def compute_conv2d_gemm_without_weight_transform( name="data_im2col", ) - # --- Pad if necessary + # Pad if necessary N_transformed = B_interleaved_t.shape[0] tile_rows_B = B_interleaved_t.shape[2] tile_cols_B = B_interleaved_t.shape[3] - if is_fast_int8_on_arm() and interleave_A: + # Select the tiling strategy for A. + # The tiling information is chosen to maximize register usage during + # the tile computation. 
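The tile shapes chosen here decide how much the im2col output has to be padded further down. As a rough standalone sketch (the helper name is made up and not part of the patch; the arithmetic mirrors the pad_M/pad_K logic in this hunk):

def pad_to_tiles(M, K, tile_rows_A, tile_cols_A):
    # Round M and K up to the next multiple of the A tile shape, so that
    # tensorize always sees full (tile_rows_A x tile_cols_A) tiles.
    pad_M = (tile_rows_A - M % tile_rows_A) % tile_rows_A
    pad_K = (tile_cols_A - K % tile_cols_A) % tile_cols_A
    return M + pad_M, K + pad_K

# The dot-product interleaved strategy tiles A as 8x4, the native one as 4x16.
# For example, M=196, K=147 is padded to (200, 148) in the 8x4 case.
print(pad_to_tiles(196, 147, 8, 4))  # (200, 148)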
+ # + # Please refer to: + # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + # In order to have more information + # + if is_dotprod_available() and interleave_A: + # If dot product has been enabled, and we are interleaving A + # tile size should be 8x4 tile_rows_A = 8 tile_cols_A = 4 else: + # If either there is no dot product or if we are using a native strategy + # tile size should be 4x16 tile_rows_A = 4 tile_cols_A = 16 - pad_m = 0 - pad_k = 0 + pad_M = 0 + pad_K = 0 if M % tile_rows_A != 0: - pad_m = tile_rows_A - (M % tile_rows_A) + pad_M = tile_rows_A - (M % tile_rows_A) if K % tile_cols_A != 0: - pad_k = tile_cols_A - (K % tile_cols_A) + pad_K = tile_cols_A - (K % tile_cols_A) - M_padded = M + pad_m - K_padded = K + pad_k + M_padded = M + pad_M + K_padded = K + pad_K N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) - pad_after = (0, pad_m, pad_k) + pad_after = (0, pad_M, pad_K) - if pad_m != 0 or pad_k != 0: + if pad_M != 0 or pad_K != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") idxm = tvm.tir.indexmod @@ -159,13 +171,13 @@ def compute_conv2d_gemm_without_weight_transform( # Configuration space configure_knobs(cfg, M_padded, K_padded) - # Pack A + # Pack the input data A_interleaved = te.compute( (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], name="A_interleaved", ) - # Compute C + # Execute GEMM C_interleaved = te.compute( (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), lambda b, x, y, w, z: te.sum( @@ -175,7 +187,7 @@ def compute_conv2d_gemm_without_weight_transform( ), name="C_interleaved", ) - # Unpack C + # Unpack the result C = te.compute( (batches, M, N), lambda b, x, y: C_interleaved[ @@ -185,7 +197,7 @@ def compute_conv2d_gemm_without_weight_transform( ) zero = tvm.tir.const(0) else: - # No need to pack/unpack + # No need to pack/unpack, execute GEMM directly C = te.compute( (batches, M_padded, N_padded), lambda b, x, y: te.sum( @@ -197,12 +209,16 @@ def compute_conv2d_gemm_without_weight_transform( ), name="C", ) + + # We need to ensure that infer bound pass does not remove the padding + # which is necessary for the tensorizations to work. 
So we need to + # add a dummy reference to the padding area of the result zero = ( tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] ) - # --- Produce the conv output + # Reshape the result into a convolution output out_shape = (batches, OH, OW, OC) out = te.compute( out_shape, @@ -212,7 +228,7 @@ def compute_conv2d_gemm_without_weight_transform( return out -def schedule_conv2d_gemm(cfg, s, out, final_out): +def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] @@ -255,7 +271,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): k = C_interleaved.op.reduce_axis[0] _, M, N = C.shape - if is_fast_int8_on_arm(): + if is_dotprod_available(): mmla = mmla_4x4_int8_int8_int32(in_type) xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( xi, yi, x_factor=8, y_factor=4 @@ -299,7 +315,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): return s -def schedule_conv2d_gemm_hybrid(cfg, s, out, final_out): +def schedule_conv2d_gemm_native(cfg, s, out, final_out): """ Schedule the conv2d_gemm hybrid strategy """ C = out.op.input_tensors[0] A = C.op.input_tensors[0] diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 022e32704b3c..43fe80178bd3 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -26,10 +26,10 @@ from .tensor_intrin import dot_int8_int8_int32 from .conv2d_gemm import ( compute_conv2d_gemm_without_weight_transform, - schedule_conv2d_gemm, - schedule_conv2d_gemm_hybrid, + schedule_conv2d_gemm_interleaved, + schedule_conv2d_gemm_native, ) -from .arm_utils import is_fast_int8_on_arm +from .arm_utils import get_tiling_B_interleaved_t def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @@ -46,20 +46,6 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): ) -def get_tiling_B(interleave_A): - """ Compute the tiling information for matrix B in A*B=C """ - if is_fast_int8_on_arm(): - # Two options in case dotprod is enabled - tile_cols_B = 4 - tile_rows_B = 12 if interleave_A else 16 - else: - # Single option if dotprod is not available - tile_rows_B = 4 - tile_cols_B = 16 - - return tile_rows_B, tile_cols_B - - @autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): """Compute conv2d int8 with NCHWc layout""" @@ -139,7 +125,7 @@ def _compute_conv2d_NHWC_quantized( ): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows_B, tile_cols_B = get_tiling_B(interleave_A) + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) return compute_conv2d_gemm_without_weight_transform( @@ -189,9 +175,9 @@ def _callback(op): if op.name == "conv2d_gemm_output": conv_out = op.output(0) if interleave_A: - schedule_conv2d_gemm(cfg, s, conv_out, out) + schedule_conv2d_gemm_interleaved(cfg, s, conv_out, out) else: - schedule_conv2d_gemm_hybrid(cfg, s, conv_out, out) + schedule_conv2d_gemm_native(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: @@ -203,45 +189,49 @@ def _callback(op): return s -#### Interleaved schedules -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def 
compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): - """ Interface for interleaved compute_conv2d_NHWC_quantized""" +# Interleaved schedules: those schedule will interleave the input data. The +# weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved""" return _compute_conv2d_NHWC_quantized( cfg, data, kernel, strides, padding, dilation, out_dtype, True ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels ): - """ Interface for interleaved compute_conv2d_NHWC_quantized_without_transform""" + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved_without_transform""" return _compute_conv2d_NHWC_quantized_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): - """ Interface for interleaved schedule_conv2d_NHWC_quantized""" +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved.arm_cpu") +def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved""" return _schedule_conv2d_NHWC_quantized(cfg, outs, True) -#### Hybrid schedules (A non interleaved, B is interleaved and transposed) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid(cfg, data, kernel, strides, padding, dilation, out_dtype): - """ Interface for hybrid compute_conv2d_NHWC_quantized""" +# Native schedules: those schedule won't interleave A (which is left in its native form). 
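As a rough usage sketch (not taken from the patch; the relay graph below is illustrative), an int8 NHWC convolution compiled for a dot-product capable AArch64 target is what ends up selecting these interleaved/native strategies. The target string mirrors the one used in the tests:

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 56, 56, 64), dtype="int8")
weight = relay.var("weight", shape=(3, 3, 64, 64), dtype="int8")
conv = relay.nn.conv2d(
    data,
    weight,
    kernel_size=(3, 3),
    channels=64,
    padding=(1, 1),
    data_layout="NHWC",
    kernel_layout="HWIO",
    out_dtype="int32",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

# With +dotprod both the native and the interleaved strategies are available;
# without it only the interleaved (and spatial pack) ones are registered.
target = "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)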
+# The weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu") +def compute_conv2d_NHWC_quantized_native(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for native compute_conv2d_NHWC_quantized""" return _compute_conv2d_NHWC_quantized( cfg, data, kernel, strides, padding, dilation, out_dtype, False ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_hybrid_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_hybrid_without_transform( +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_native_without_transform( cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels ): - """ Interface for hybrid compute_conv2d_NHWC_quantized_without_transform""" + """ Interface for compute_conv2d_NHWC_quantized_native_without_transform""" return _compute_conv2d_NHWC_quantized_without_transform( cfg, data, @@ -256,7 +246,7 @@ def compute_conv2d_NHWC_quantized_hybrid_without_transform( ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized_hybrid.arm_cpu") -def schedule_conv2d_NHWC_quantized_hybrid(cfg, outs): - """ Interface for hybrid schedule_conv2d_NHWC_quantized""" +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu") +def schedule_conv2d_NHWC_quantized_native(cfg, outs): + """ Interface for native schedule_conv2d_NHWC_quantized""" return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 3d1c97139daa..eaf9c209d6dd 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -594,10 +594,24 @@ def select_word(vec, lane, dtype_vec): The pseudo-code for this operation is: v = [x0, ..., x15] - vsub(i) = v[i:i+3] - replicated_v(i) = [vsub(i), vsub(i), vsub(i), vsub(i)] + vsub(lane) = v[4*lane:4*lane+3] + replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)] - Note that i can vary between 0 and 3 + Note that 0<=lane<4 + + Parameters + ---------- + vec: tvm.tir.Expr + int8x16 vector expression + lane: int + vector lane we want to replicate + dtype_vec: str + vector data type (e.g., int8x16) + + Returns + ---------- + output: tvm.tir.Expr + replicated vector """ # Reinterpret vec_a as 4 int32 words vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) From 01d827a0b0a7ac894fd5c7e7a32adaa9751dc89b Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 25 Sep 2020 12:15:26 +0100 Subject: [PATCH 06/10] Fix black linting issues Change-Id: I1813b0226b536aedee0dce9eeeba27aa2d95518b --- python/tvm/topi/arm_cpu/arm_utils.py | 2 +- python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index c18312a7045c..f5c94266bec6 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -33,7 +33,7 @@ def is_aarch64_arm(): def get_tiling_B_interleaved_t(interleave_A): - """ Compute the tiling information for matrix B', where B' + """Compute the tiling information for matrix B', where B' is the transposed and interleaved version of matrix B in C=A*B. 
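As a side reference for the B' layout being described, a NumPy sketch under my reading of the shapes used in conv2d_gemm.py (the function name is made up, not part of the patch):

import numpy as np

def interleave_transpose_ref(B, tile_rows_B, tile_cols_B):
    # B has shape (K, N); B' = B^T is padded and split into
    # (tile_rows_B, tile_cols_B) blocks, matching B_interleaved_t's
    # (N // tile_rows_B, K // tile_cols_B, tile_rows_B, tile_cols_B) shape.
    K, N = B.shape
    pad_N = (tile_rows_B - N % tile_rows_B) % tile_rows_B
    pad_K = (tile_cols_B - K % tile_cols_B) % tile_cols_B
    B_t = np.pad(B.T, ((0, pad_N), (0, pad_K)))
    N_pad, K_pad = B_t.shape
    B_tiled = B_t.reshape(N_pad // tile_rows_B, tile_rows_B, K_pad // tile_cols_B, tile_cols_B)
    return B_tiled.transpose(0, 2, 1, 3)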
The tiling information is chosen to maximize register usage during the diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index f81d8f593390..a64bc413e0c6 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -33,7 +33,7 @@ def interleave_transpose_weights(inputs, data, kernel, interleave_A): - """ Transform the weight matrix by reshaping, interleaving and transposing it + """Transform the weight matrix by reshaping, interleaving and transposing it Parameters ---------- From 82ce6474c645a92f60da48701a12e5219437e9b5 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 25 Sep 2020 16:24:18 +0100 Subject: [PATCH 07/10] Fixing failing test and adding tests for dot-product compilation Change-Id: Ic040722abd5538fccb85af4de922394c939e7000 --- python/tvm/topi/arm_cpu/tensor_intrin.py | 15 ++- .../python/relay/test_pass_alter_op_layout.py | 2 +- .../topi/python/test_topi_conv2d_int8.py | 105 +++++++++++------- 3 files changed, 70 insertions(+), 52 deletions(-) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index eaf9c209d6dd..79b9bf8fd732 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -677,6 +677,8 @@ def mmla_4x4_int8_int8_int32(dtype): C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() @@ -685,7 +687,7 @@ def _instr(index): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() - vec_a = ins[0].vload([0, 0]) + vec_a = ins[0].vload([0, 0], dtype_vec) vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] vec_b = ins[1].vload([0, 0], dtype_vec) @@ -693,12 +695,7 @@ def _instr(index): for i in range(0, 4): vec_c = outs[0].vload([i, 0], "int32x4") vdot = tvm.tir.call_llvm_intrin( - "int32x4", - "llvm.aarch64.neon.sdot", - tvm.tir.const(3, "uint32"), - vec_c, - vec_b, - vec_aa[i], + "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i], ) # Store the result @@ -779,6 +776,8 @@ def mmla_16x4_int8_int8_int32(dtype, rows): C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() @@ -797,7 +796,7 @@ def _instr(index): vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( "int32x4", - "llvm.aarch64.neon.sdot", + llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 7b242c479bf1..ff6997e2f991 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1016,7 +1016,7 @@ def _query_inside(self, target, workload): def update(self, target, workload, cfg): key = (str(target), workload) assert workload[2][1] == expected_workload_shape - assert workload[0] == "conv2d_NHWC_quantized_without_transform.arm_cpu" + assert workload[0] == "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" self.memory[key] = cfg def alter_conv2d(attrs, inputs, tinfos, out_type): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py 
b/tests/python/topi/python/test_topi_conv2d_int8.py index 238517e5ed75..b2f9835c3d66 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -56,48 +56,67 @@ def compile_conv2d_NHWC_gemm_int8_arm( W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter,), name="bias", dtype="int8") dtype = "int32" - device = "llvm --device arm_cpu --mtriple aarch64-linux-gnu" - - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): - assert is_aarch64_arm(), "AArch64 target not recognized" - - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) + devices = [ + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + ] + + for device_tuple in devices: + device = device_tuple[0] + compute = device_tuple[1] + schedule = device_tuple[2] + + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) - - if add_bias: - tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: - func = tvm.build( - s, - [A, W, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) def verify_conv2d_NHWC_gemm_int8( @@ -155,14 +174,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( + C = 
topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) + s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) From 16b9c6e2eb0697dacd004c3a450e152b7f4234aa Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 28 Sep 2020 15:14:10 +0100 Subject: [PATCH 08/10] Fixing linting and review comments Change-Id: If09e3baa514c85dc78d3c27c2ac2fa2e01773d89 --- python/tvm/topi/arm_cpu/arm_utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index f5c94266bec6..fb47b29d6f6a 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -17,13 +17,36 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Arm target utility functions""" +import re import tvm +def get_arch_version(target_mattr): + """ Parse the LLVM target -mattr, and return + the architecture version in a decimal representation + (e.g., if -mattr=v8.4a, return 8.4) + """ + + arch_version = 8.0 + m = re.compile(r"\+v(.*)\.(.*)a") + for attr in target_mattr: + match_obj = m.match(attr) + if match_obj: + major = int(match_obj.group(1)) + minor = int(match_obj.group(2)) + decimal = 10 + if minor >= 10: + decimal = 100 + arch_version = major + float(minor) / decimal + + return arch_version + + def is_dotprod_available(): """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ target = tvm.target.Target.current(allow_none=False) - return "+v8.2a" in target.mattr and "+dotprod" in target.mattr + arch_version = get_arch_version(target.mattr) + return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr) def is_aarch64_arm(): From 8fd794a66a3bea14f67e137c5d8e1d421502cda3 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 29 Sep 2020 14:50:05 +0100 Subject: [PATCH 09/10] Fixing black linting and address comments Change-Id: I857b28b6f9b23307d8c1eebc509de6ad2783c756 --- python/tvm/topi/arm_cpu/arm_utils.py | 2 +- python/tvm/topi/arm_cpu/conv2d_gemm.py | 12 +-- python/tvm/topi/arm_cpu/tensor_intrin.py | 111 +++++++++++++++++------ 3 files changed, 88 insertions(+), 37 deletions(-) diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py index fb47b29d6f6a..7e0f566b96f4 100644 --- a/python/tvm/topi/arm_cpu/arm_utils.py +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -22,7 +22,7 @@ def get_arch_version(target_mattr): - """ Parse the LLVM target -mattr, and return + """Parse the LLVM target -mattr, and return the architecture version in a decimal representation (e.g., if -mattr=v8.4a, return 8.4) """ diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 6896fd03289a..b40fb89b5d33 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -26,8 +26,8 @@ from .tensor_intrin import ( gemm_quantized, gemm_quantized_impl, - mmla_4x4_int8_int8_int32, - mmla_16x4_int8_int8_int32, + gemm_acc_4x4_int8_int8_int32, + gemm_acc_nx16_int8_int8_int32, ) from .arm_utils import is_aarch64_arm, is_dotprod_available @@ -272,7 +272,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): k = C_interleaved.op.reduce_axis[0] _, M, N = C.shape if 
is_dotprod_available(): - mmla = mmla_4x4_int8_int8_int32(in_type) + gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( xi, yi, x_factor=8, y_factor=4 ) @@ -289,7 +289,7 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): yi_inner, k_inner, ) - s[C_interleaved].tensorize(xi_inner_inner, mmla) + s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) s[C_interleaved].unroll(xi_inner_outer) elif is_aarch64_arm(): @@ -327,9 +327,9 @@ def schedule_conv2d_gemm_native(cfg, s, out, final_out): k_outer, k_inner = s[C].split(k, 16) x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) - mmla = mmla_16x4_int8_int8_int32(in_type, rows=1) + gemm_acc = gemm_acc_nx16_int8_int8_int32(in_type, rows=1) s[C].unroll(x_inner) - s[C].tensorize(y_inner, mmla) + s[C].tensorize(y_inner, gemm_acc) s[C].parallel(x_outer) # Input transform diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 79b9bf8fd732..9ed4c591da0f 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -622,28 +622,28 @@ def select_word(vec, lane, dtype_vec): return vec_int8_broadcast -def mmla_4x4_int8_int8_int32(dtype): +def gemm_acc_4x4_int8_int8_int32(dtype): """ - Int8 4x4 matrix multiplication using sdot/udot instructions - This function takes two arrays of int8 datatype -- A[4][4] and - B[4][4] and produces a 4x4 matrix which is equal to A*B + Int8 4x4 matrix multiplication and accumulation using sdot/udot + instructions. This function takes two arrays of int8 datatype + -- A[4][4] and B[4][4] and produces a 4x4 matrix + which is equal to A*B. + The pseudo code is as follows. .. code-block:: c - void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 output[4][4]){ + void gemm_acc_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 C[4][4]){ for (int i = 0; i < 4; i++){ for (int j = 0; i < 4; i++){ - out[i][j] = 0; for (int k = 0; k < 4; k++){ - out[i][j] += A[i][k] * B[j][k] + C[i][j] += A[i][k] * B[j][k] } } } Notes: * The rows of matrix B are transposed - * Matrix A is interleaved This function returns a TensorIntrin that can be used to tensorize a schedule. 
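A compact NumPy reference of what this intrinsic accumulates (a sketch, not part of the patch; note the accumulation into C and the transposed B operand):

import numpy as np

def gemm_acc_4x4_ref(A, B, C):
    # C[i, j] += sum_k A[i, k] * B[j, k], with int32 accumulation.
    C += A.astype(np.int32) @ B.astype(np.int32).T
    return C

A = np.arange(16, dtype=np.int8).reshape(4, 4)
B = np.ones((4, 4), dtype=np.int8)
C = np.zeros((4, 4), dtype=np.int32)
print(gemm_acc_4x4_ref(A, B, C))  # row i of C holds four copies of sum(A[i, :])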
Parameters @@ -656,22 +656,25 @@ def mmla_4x4_int8_int8_int32(dtype): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((te.var("rows"), 4), dtype, name="data") - kernel = te.placeholder((4, 4), dtype, name="kernel") + # This needs to be a variable number of "rows" since TVM + # "thinks" I only need to compute one row because of + # padding + A = te.placeholder((te.var("rows"), 4), dtype, name="data") + B = te.placeholder((4, 4), dtype, name="kernel") dtype_vec = dtype + "x16" k = te.reduce_axis((0, 4), name="k") C = te.compute( (te.var("rows"), 4), - lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, k].astype("int32"), axis=k), + lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), name="C", ) aa_buffer = tvm.tir.decl_buffer( - data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] ) bb_buffer = tvm.tir.decl_buffer( - kernel.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] ) cc_buffer = tvm.tir.decl_buffer( C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] @@ -686,16 +689,49 @@ def _instr(index): for i in range(0, 4): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) return ib.get() - + # Load all the elements of tile A. + # vec_a = [a, b, c, d, + # e, f, g, h, + # i, l, m, n, + # o, p, q, r,]; vec_a = ins[0].vload([0, 0], dtype_vec) + + # Replicate 4 times the i-th row of A. For instance, + # vec_a[0] = [a, b, c, d, + # a, b, c, d, + # a, b, c, d, + # a, b, c, d,]; vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + + # Load all the elements of B. Remember that B + # is transposed: + # vec_b = [0, 4, 8, 12, + # 1, 5, 9, 13, + # 2, 6, 10, 14, + # 3, 7, 11, 15,]; vec_b = ins[1].vload([0, 0], dtype_vec) # Execute the dot product for i in range(0, 4): vec_c = outs[0].vload([i, 0], "int32x4") + # Compute the product between the i-th row of A + # and all the rows of B. Remember that sdot/udot + # subdive the input vectors in 16 elements + # and then take the dot product among each group. + # The result is stored in a int32x4 register + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*4+c*8+d*12, + # a*1+b*5+c*9+d*13, + # a*2+b*6+c*10+d*14, + # a*3+b*7+c*11+d*15] vdot = tvm.tir.call_llvm_intrin( - "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i], + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], ) # Store the result @@ -710,33 +746,36 @@ def _instr(index): return te.decl_tensor_intrin( C.op, _intrin_func, - binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, default_buffer_params=buffer_params, ) -def mmla_16x4_int8_int8_int32(dtype, rows): +def gemm_acc_nx16_int8_int8_int32(dtype, rows): """ - Int8 16x4 matrix multiplication using sdot/udot instructions + Int8 16x4 matrix multiplication and accumulation using sdot/udot instructions This function takes two arrays of int8 datatype -- A[rows][4] and B[4][16] and produces a rowsx16 matrix which is equal to A*B The pseudo code is as follows. .. 
code-block:: c - void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 output[rows][16]){ + void mmla_16x4_int8_int8_int32(int8 A[rows][16], int8 B[4][16][4], int32 output[rows][16]){ for (int i = 0; i < rows; i++){ for (int j = 0; i < 16; i++){ - out[i][j] = 0; - for (int k = 0; k < 4; k++){ - out[i][j] += A[i][k] * B[j][k] + for (int k = 0; k < 16; k++){ + out[i][j] += A[i][k] * B[k//4][j][k%4] } + } } } Notes: * The rows of matrix B are transposed - * A is not interleaved, but used in its native form + * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 + we need 4 tiles of B to compute a single row of the output. The first 4 values of + k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on + This function returns a TensorIntrin that can be used to tensorize a schedule. Parameters @@ -749,24 +788,24 @@ def mmla_16x4_int8_int8_int32(dtype, rows): intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - data = te.placeholder((rows, 16), dtype, name="data") - kernel = te.placeholder((4, 16, 4), dtype, name="kernel") + A = te.placeholder((rows, 16), dtype, name="data") + B = te.placeholder((4, 16, 4), dtype, name="kernel") dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod k = te.reduce_axis((0, 16), name="k") C = te.compute( (rows, 16), lambda i, j: te.sum( - data[i, k].astype("int32") * kernel[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + A[i, k].astype("int32") * B[k // 4, j, idxm(k, 4)].astype("int32"), axis=k ), name="C", ) aa_buffer = tvm.tir.decl_buffer( - data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] ) bb_buffer = tvm.tir.decl_buffer( - kernel.shape, + B.shape, dtype, name="bb_buffer", offset_factor=1, @@ -785,14 +824,26 @@ def _instr(index): for i in range(0, rows): ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) return ib.get() - + # Iterate on the number of rows of the output for k in range(0, rows): + # Load 16 elements of A + # vec_a = [a, b, c, e, f, g, h, i, l, m, n, o, p, q, r,]; vec_a = ins[0].vload([k, 0], dtype_vec) + # Iterate over each column of the output for j in range(0, 4): + # Accumulate over each of the 4 (16x4) tiles contained in B for i in range(0, 4): + # As before, replicate a single 4-element group of A vec_aa = select_word(vec_a, i, dtype_vec) + # Load 4 rows (each rows with 4 elements) from B + # vec_b = [0, 16, 32, 48, + # 1, 17, 33, 49, + # 2, 18, 34, 50, + # 3, 19, 35, 51,]; vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + # Store the result of the accumulation in the + # correct part of the output vec_c = outs[0].vload([k, 4 * j], "int32x4") vdot = tvm.tir.call_llvm_intrin( "int32x4", @@ -812,7 +863,7 @@ def _instr(index): return te.decl_tensor_intrin( C.op, _intrin_func, - binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer}, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, default_buffer_params=buffer_params, ) From 0f8116a191f47b1942601944760b4a5e8b51e748 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 2 Oct 2020 18:27:07 +0100 Subject: [PATCH 10/10] Address review comments Change-Id: I63d1a639d4a72abeb33148fd2868cd356ef84122 --- python/tvm/topi/arm_cpu/tensor_intrin.py | 49 +++++++++++++++--------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 9ed4c591da0f..73cfacb62079 100644 --- 
a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -644,7 +644,7 @@ def gemm_acc_4x4_int8_int8_int32(dtype): Notes: * The rows of matrix B are transposed - This function returns a TensorIntrin that can be used to tensorize a schedule. + * The tiling strategy is picked to maximize register usage. Parameters ---------- @@ -659,8 +659,8 @@ def gemm_acc_4x4_int8_int8_int32(dtype): # This needs to be a variable number of "rows" since TVM # "thinks" I only need to compute one row because of # padding - A = te.placeholder((te.var("rows"), 4), dtype, name="data") - B = te.placeholder((4, 4), dtype, name="kernel") + A = te.placeholder((te.var("rows"), 4), dtype, name="A") + B = te.placeholder((4, 4), dtype, name="B") dtype_vec = dtype + "x16" k = te.reduce_axis((0, 4), name="k") @@ -692,8 +692,8 @@ def _instr(index): # Load all the elements of tile A. # vec_a = [a, b, c, d, # e, f, g, h, - # i, l, m, n, - # o, p, q, r,]; + # l, m, n, o, + # p, q, r, s]; vec_a = ins[0].vload([0, 0], dtype_vec) # Replicate 4 times the i-th row of A. For instance, @@ -753,15 +753,15 @@ def _instr(index): def gemm_acc_nx16_int8_int8_int32(dtype, rows): """ - Int8 16x4 matrix multiplication and accumulation using sdot/udot instructions - This function takes two arrays of int8 datatype -- A[rows][4] and + Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[n][4] and B[4][16] and produces a rowsx16 matrix which is equal to A*B The pseudo code is as follows. .. code-block:: c - void mmla_16x4_int8_int8_int32(int8 A[rows][16], int8 B[4][16][4], int32 output[rows][16]){ - for (int i = 0; i < rows; i++){ + void mmla_nx16_int8_int8_int32(int8 A[n][16], int8 B[4][16][4], int32 output[n][16]){ + for (int i = 0; i < n; i++){ for (int j = 0; i < 16; i++){ for (int k = 0; k < 16; k++){ out[i][j] += A[i][k] * B[k//4][j][k%4] @@ -775,21 +775,22 @@ def gemm_acc_nx16_int8_int8_int32(dtype, rows): * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 we need 4 tiles of B to compute a single row of the output. The first 4 values of k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on - - This function returns a TensorIntrin that can be used to tensorize a schedule. + * The tiling strategy is picked to maximize register usage. 
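The same kind of NumPy reference for the nx16 variant (a sketch, not part of the patch), spelling out the B tile layout used in the pseudo code above:

import numpy as np

def gemm_acc_nx16_ref(A, B_tiles, C):
    # A: (n, 16) int8, B_tiles: (4, 16, 4) int8, C: (n, 16) int32 accumulator.
    # out[i][j] += A[i][k] * B[k // 4][j][k % 4] for k in 0..15, in int32.
    n = A.shape[0]
    A32, B32 = A.astype(np.int32), B_tiles.astype(np.int32)
    for i in range(n):
        for j in range(16):
            for k in range(16):
                C[i, j] += A32[i, k] * B32[k // 4, j, k % 4]
    return C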
Parameters ---------- dtype: str, {"uint8", "int8"} Whether it works on unsigned int or signed int + rows: int + Number of of the output rows "n" Returns ------- intrin : TensorIntrin The Arm TensorIntrin that can be used in tensorizing schedule """ - A = te.placeholder((rows, 16), dtype, name="data") - B = te.placeholder((4, 16, 4), dtype, name="kernel") + A = te.placeholder((rows, 16), dtype, name="A") + B = te.placeholder((4, 16, 4), dtype, name="B") dtype_vec = dtype + "x16" idxm = tvm.tir.indexmod k = te.reduce_axis((0, 16), name="k") @@ -827,24 +828,34 @@ def _instr(index): # Iterate on the number of rows of the output for k in range(0, rows): # Load 16 elements of A - # vec_a = [a, b, c, e, f, g, h, i, l, m, n, o, p, q, r,]; + # vec_a = [a, b, c, d, e, f, g, h, l, m, n, o, p, q, r, s]; vec_a = ins[0].vload([k, 0], dtype_vec) - # Iterate over each column of the output + # Iterate over each of the 4 rowsx4 tiles of the output for j in range(0, 4): # Accumulate over each of the 4 (16x4) tiles contained in B for i in range(0, 4): - # As before, replicate a single 4-element group of A + # Replicate a single 4-element group of A (A[k, i:i+4]) vec_aa = select_word(vec_a, i, dtype_vec) - # Load 4 rows (each rows with 4 elements) from B + + # Load 4 rows (each rows with 4 elements) from B (B[i:i+4, j:j+4]) # vec_b = [0, 16, 32, 48, # 1, 17, 33, 49, # 2, 18, 34, 50, # 3, 19, 35, 51,]; vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) - # Store the result of the accumulation in the - # correct part of the output + + # Accumulate in the correct part of the output vec_c = outs[0].vload([k, 4 * j], "int32x4") + + # Compute the dot product between the rowsx4 tile + # from A and the 4x4 tile from B + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*16+c*32+d*48, + # a*1+b*17+c*33+d*49, + # a*2+b*18+c*34+d*50, + # a*3+b*19+c*35+d*51] vdot = tvm.tir.call_llvm_intrin( "int32x4", llvm_intrin,