From 8855eedc3d87f8373efdb1b16b09404c2fbf0cb5 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 12 Oct 2020 14:51:12 +0100 Subject: [PATCH 1/6] Improve depthwise convolution through smlal/smlal2 intrinsic - Added an intrinsic to load a single int16x8 vector and produce two int32x4 output vectors through smlal/smlal2 instructions - Changed the NHWC depthwise schedule to accomodate the aforementioned intrinsic Change-Id: I347c3bf98fa8dd87057304dcda0d78e558424c57 --- python/tvm/topi/arm_cpu/depthwise_conv2d.py | 41 +++++++-- python/tvm/topi/arm_cpu/tensor_intrin.py | 90 +++++++++++++++++++ .../topi/python/test_topi_depthwise_conv2d.py | 54 +++++++++++ 3 files changed, 179 insertions(+), 6 deletions(-) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index 3c32d3e1f3f2..0cae30c46b27 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -25,7 +25,8 @@ from .. import nn from ..utils import traverse_inline, get_const_tuple, get_const_int from ..nn.utils import get_pad_tuple - +from .tensor_intrin import smlal_int16_int32 +from .arm_utils import is_aarch64_arm @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): @@ -222,7 +223,6 @@ def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, o output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ - out_dtype = out_dtype or data.dtype N, IH, IW, IC = get_const_tuple(data.shape) @@ -298,11 +298,15 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs): cfg["tile_c"] = SplitEntity([-1, 8]) cfg["tile_h"] = SplitEntity([-1, 2]) cfg["tile_w"] = SplitEntity([-1, 2]) - cfg["locate_output"] = OtherOptionEntity(1) + cfg["locate_output"] = OtherOptionEntity(2) ##### space definition end ##### def schedule_conv(conv): conv_data = conv.op.input_tensors[0] + kernel_data = conv.op.input_tensors[1] + in_type = conv_data.dtype + + _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape) n, w, h, c = conv.op.axis r_h, r_w = conv.op.reduce_axis @@ -310,12 +314,13 @@ def schedule_conv(conv): wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) + data_pad_value = -1 if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) # Define a policy for padding computation cfg.define_knob("data_pad_inline", [1, 2, 3]) if cfg.is_fallback: - cfg["data_pad_inline"] = OtherOptionEntity(3) + cfg["data_pad_inline"] = OtherOptionEntity(2) if cfg["data_pad_inline"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) @@ -324,10 +329,31 @@ def schedule_conv(conv): s[conv_data].compute_at(s[conv], wo) if cfg["data_pad_inline"].val == 3: s[conv_data].compute_inline() + data_pad_value = cfg["data_pad_inline"].val + + split_val = cfg["tile_c"].size[-1] + use_tensorization = ( + (in_type == "int16") + and (split_val == 8) + and (IC % split_val == 0) + and (channel_multiplier == 1) + and (data_pad_value != 3) + and is_aarch64_arm() + ) + + if use_tensorization: + smlal = smlal_int16_int32() + s[conv].tensorize(ci, smlal) + else: + s[conv].vectorize(ci) + + s[conv].unroll(r_h) + s[conv].unroll(r_w) + s[conv].unroll(wi) + s[conv].unroll(hi) s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) - s[conv].vectorize(ci) return fused_n_ho def schedule_conv_out(out): @@ -335,13 +361,16 @@ def 
schedule_conv_out(out): co, ci = cfg["tile_c"].apply(s, out, c) wo, wi = cfg["tile_w"].apply(s, out, w) ho, hi = cfg["tile_h"].apply(s, out, h) - s[out].reorder(n, ho, wo, co, hi, wi) + s[out].reorder(n, ho, wo, co, hi, wi, ci) + s[out].unroll(wi) + s[out].unroll(hi) if out.dtype in ["int8", "uint8"]: # In case of quantized convolution further split the channel in batches of 4 elements # so that we can use arm intrinsics to run fixed_point_multiplication ci_outer, ci_inner = s[out].split(ci, 4) s[out].vectorize(ci_inner) + s[out].unroll(ci_outer) fused_n_ho = s[out].fuse(n, ho) return hi, wi, fused_n_ho diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index 196f788e6b8c..bd36d4ad5577 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -879,6 +879,96 @@ def _instr(index): ) +def smlal_int16_int32(): + """ + Intrinsic to be used in order to load two int16x8 vectors and multiply + them together through a pair of smlal/smlal2 instructions. The pseudo-code + for the algorithm is as follows: + + vec_a = vld1q_s16(A) + vec_b = vld1q_s16(B) + + vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction + vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction + + So we load a single int16x8 vector and we accumulate its lower (0:4) and + higher part separately. + """ + int16_lanes = 8 + A = te.placeholder((int16_lanes,), dtype="int16", name="A") + B = te.placeholder((int16_lanes, 1), dtype="int16", name="B") + C = te.compute( + (int16_lanes,), + lambda i: A[i].astype("int32") * B[i, 0].astype("int32"), + name="C", + ) + + a_buffer = tvm.tir.decl_buffer( + A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1] + ) + b_buffer = tvm.tir.decl_buffer( + B.shape, + dtype="int16", + name="b_buffer", + offset_factor=1, + strides=[te.var("sb"), 1], + ) + c_buffer = tvm.tir.decl_buffer( + C.shape, + dtype="int32", + name="c_buffer", + offset_factor=1, + strides=[1], + ) + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8"))) + return ib.get() + + vec_a = ins[0].vload([0], "int16x8") + vec_b = ins[1].vload([0, 0], "int16x8") + inst = "llvm.aarch64.neon.smull" + + # Lower part of the vector + vec_c_h = outs[0].vload([4], "int32x4") + vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a) + vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b) + vmull_h = tvm.tir.call_llvm_pure_intrin( + "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h + ) + vec_out_h = vec_c_h + vmull_h + + # Lower part of the vector + vec_c_l = outs[0].vload([0], "int32x4") + vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a) + vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b) + vmull_l = tvm.tir.call_llvm_pure_intrin( + "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l + ) + vec_out_l = vec_c_l + vmull_l + + # Combine higher and lower part in a single int32x8 vector to store + # (this will require two different STR instructions, since the + # length of a NEON vector is fixed at 128 + vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h) + ib.emit(outs[0].vstore(0, vec_out)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: a_buffer, B: b_buffer, C: c_buffer}, + 
default_buffer_params=buffer_params, + ) + + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index d5fbef98593c..55d2fe0c4e52 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -56,6 +56,55 @@ } +def compile_depthwise_NHWC_int8_arm( + batch, + in_channel, + in_size, + kernel, + depth_multiplier, + stride, + padding, + add_bias=False, + dilation=1, +): + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + + in_height = in_width = in_size + A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int16") + W = te.placeholder((kernel, kernel, in_channel, depth_multiplier), name="W", dtype="int16") + bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32") + dtype = "int32" + + device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu" + compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc + schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc + + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C += bias + ins_outs = [A, W, bias, C] + else: + ins_outs = [A, W, C] + + s = schedule([C]) + + func = tvm.build( + s, + ins_outs, + device, + name="depthwise_conv2d", + ) + + def depthwise_conv2d_with_workload_nchw( batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1 ): @@ -478,6 +527,7 @@ def test_depthwise_conv2d(): depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME") depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID") depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID") + # dilation = 2 # disabled because it uses too large shared memory on cuda # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2) @@ -487,6 +537,10 @@ def test_depthwise_conv2d(): depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") + # Test compilation on arm devices + compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME") + compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True) + if __name__ == "__main__": test_depthwise_conv2d() From 155ab8ab65912315c39188d91129dde285ea97bb Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 29 Oct 2020 18:29:08 +0000 Subject: [PATCH 2/6] Address review comments --- python/tvm/topi/arm_cpu/depthwise_conv2d.py | 66 +++++++++++++-------- python/tvm/topi/arm_cpu/tensor_intrin.py | 8 +-- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index 0cae30c46b27..eff095a6b1b8 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -288,17 +288,26 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs): ##### space definition begin ##### n, h, w, c = s[out].op.axis + # Split the number of input/output channels cfg.define_split("tile_c", c, 
num_outputs=2) + # Split the height of the convolution _, hi = cfg.define_split("tile_h", h, num_outputs=2) + # Split the width of the convolution _, wi = cfg.define_split("tile_w", w, num_outputs=2) + # Additional out (e.g., requantization, bias addition, etc..) + # 0: locate the output on the second last axis of the main compuation + # 1: locate the output closest to the main computation cfg.define_knob("locate_output", [0, 1]) + # Determine if we should unroll the computation of the inner tile + cfg.define_knob("unroll_tile", [True, False]) # fallback support if cfg.is_fallback: cfg["tile_c"] = SplitEntity([-1, 8]) cfg["tile_h"] = SplitEntity([-1, 2]) cfg["tile_w"] = SplitEntity([-1, 2]) - cfg["locate_output"] = OtherOptionEntity(2) + cfg["locate_output"] = OtherOptionEntity(1) + cfg["unroll_tile"] = OtherOptionEntity(True) ##### space definition end ##### def schedule_conv(conv): @@ -314,43 +323,49 @@ def schedule_conv(conv): wo, wi = cfg["tile_w"].apply(s, conv, w) co, ci = cfg["tile_c"].apply(s, conv, c) + split_val = cfg["tile_c"].size[-1] + use_tensorization = ( + (in_type == "int16") + and (split_val == 8) + and (IC % split_val == 0) + and (channel_multiplier == 1) + and is_aarch64_arm() + ) + data_pad_value = -1 if conv_data.name == "data_pad": assert isinstance(conv_data.op, tvm.te.ComputeOp) - # Define a policy for padding computation - cfg.define_knob("data_pad_inline", [1, 2, 3]) + # Define a strategy for padding computation + cfg.define_knob("data_pad_strategy", [1, 2, 3]) if cfg.is_fallback: - cfg["data_pad_inline"] = OtherOptionEntity(2) - if cfg["data_pad_inline"].val == 1: + # We cannot inline padding when tensorizing. So, if we can tensorize, let's compute_at the closest axis + cfg["data_pad_strategy"] = ( + OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3) + ) + # Compute padding on the third to last axis of the computation + if cfg["data_pad_strategy"].val == 1: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], ho) - if cfg["data_pad_inline"].val == 2: + # Compute padding on the second to last axis of the computation + if cfg["data_pad_strategy"].val == 2: s[conv_data].vectorize(list(s[conv_data].op.axis)[-1]) s[conv_data].compute_at(s[conv], wo) - if cfg["data_pad_inline"].val == 3: + # Inline padding during computation + if cfg["data_pad_strategy"].val == 3: s[conv_data].compute_inline() - data_pad_value = cfg["data_pad_inline"].val - - split_val = cfg["tile_c"].size[-1] - use_tensorization = ( - (in_type == "int16") - and (split_val == 8) - and (IC % split_val == 0) - and (channel_multiplier == 1) - and (data_pad_value != 3) - and is_aarch64_arm() - ) + data_pad_value = cfg["data_pad_strategy"].val - if use_tensorization: + if use_tensorization and data_pad_value != 3: smlal = smlal_int16_int32() s[conv].tensorize(ci, smlal) else: s[conv].vectorize(ci) - s[conv].unroll(r_h) - s[conv].unroll(r_w) - s[conv].unroll(wi) - s[conv].unroll(hi) + if cfg["unroll_tile"].val: + s[conv].unroll(r_h) + s[conv].unroll(r_w) + s[conv].unroll(wi) + s[conv].unroll(hi) s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci) fused_n_ho = s[conv].fuse(n, ho) @@ -362,8 +377,9 @@ def schedule_conv_out(out): wo, wi = cfg["tile_w"].apply(s, out, w) ho, hi = cfg["tile_h"].apply(s, out, h) s[out].reorder(n, ho, wo, co, hi, wi, ci) - s[out].unroll(wi) - s[out].unroll(hi) + if cfg["unroll_tile"]: + s[out].unroll(wi) + s[out].unroll(hi) if out.dtype in ["int8", "uint8"]: # In case of quantized convolution further split the channel in batches 
of 4 elements diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index bd36d4ad5577..1b999dfe4e80 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -885,8 +885,8 @@ def smlal_int16_int32(): them together through a pair of smlal/smlal2 instructions. The pseudo-code for the algorithm is as follows: - vec_a = vld1q_s16(A) - vec_b = vld1q_s16(B) + vec_a = vload(A, "int16x8") + vec_b = vload(B, "int16x8") vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction @@ -932,7 +932,7 @@ def _instr(index): vec_b = ins[1].vload([0, 0], "int16x8") inst = "llvm.aarch64.neon.smull" - # Lower part of the vector + # Higher part of the vector vec_c_h = outs[0].vload([4], "int32x4") vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a) vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b) @@ -951,7 +951,7 @@ def _instr(index): vec_out_l = vec_c_l + vmull_l # Combine higher and lower part in a single int32x8 vector to store - # (this will require two different STR instructions, since the + # (this will require two different store instructions, since the # length of a NEON vector is fixed at 128 vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h) ib.emit(outs[0].vstore(0, vec_out)) From efb8ef9d3c8cefc40e29f8662cf0eda418bd8728 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 29 Oct 2020 22:39:52 +0000 Subject: [PATCH 3/6] Rebasing - 2 From 3149923fd52ee1321472d72a2fceecc8bae63f00 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 29 Oct 2020 22:44:06 +0000 Subject: [PATCH 4/6] Rebasing - 3 --- python/tvm/topi/arm_cpu/depthwise_conv2d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index eff095a6b1b8..d116892a78dd 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -28,6 +28,7 @@ from .tensor_intrin import smlal_int16_int32 from .arm_utils import is_aarch64_arm + @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): """Compute depthwise_conv2d with NCHW layout""" From c825a4e31666427eb6eeb3e11fa007013ecb867a Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 29 Oct 2020 22:44:26 +0000 Subject: [PATCH 5/6] Rebasing - 3 From 5034afb62573446c12bf429a68661465352f17db Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 30 Oct 2020 12:18:16 +0000 Subject: [PATCH 6/6] Fix linting --- python/tvm/topi/arm_cpu/depthwise_conv2d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index d116892a78dd..441b0a5a3688 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -339,7 +339,8 @@ def schedule_conv(conv): # Define a strategy for padding computation cfg.define_knob("data_pad_strategy", [1, 2, 3]) if cfg.is_fallback: - # We cannot inline padding when tensorizing. So, if we can tensorize, let's compute_at the closest axis + # We cannot inline padding when tensorizing. + # So, if we can tensorize, let's compute_at the closest axis cfg["data_pad_strategy"] = ( OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3) )
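
Usage sketch (illustrative, not part of the patches): the snippet below exercises the new NHWC depthwise schedule and the smlal/smlal2 tensor intrinsic end to end. It mirrors the compile_depthwise_NHWC_int8_arm test added in patch 1 and assumes a TVM build that already contains this series plus an LLVM backend with AArch64 support. The workload (1x32x32x728, 3x3 kernel, channel multiplier 1, int16 inputs accumulated into int32) is taken from that test; the assembly dump at the end is only a convenience for checking that smlal/smlal2 pairs are emitted.

import tvm
from tvm import te, topi

# Workload taken from the test added in patch 1:
# batch=1, in_size=32, in_channel=728, kernel=3, depth_multiplier=1,
# stride=1, padding="SAME", int16 inputs accumulated into int32.
A = te.placeholder((1, 32, 32, 728), name="A", dtype="int16")
W = te.placeholder((3, 3, 728, 1), name="W", dtype="int16")

target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
with tvm.target.Target(target):
    # The fallback config picks tile_c = 8, so with IC % 8 == 0, int16 inputs
    # and channel_multiplier == 1 the schedule takes the tensorize path
    # (see use_tensorization in schedule_conv above).
    C = topi.arm_cpu.compute_depthwise_conv2d_nhwc(A, W, (1, 1), "SAME", (1, 1), "int32")
    s = topi.arm_cpu.schedule_depthwise_conv2d_nhwc([C])
    func = tvm.build(s, [A, W, C], target, name="depthwise_conv2d")

# Optional: inspect the generated assembly; the inner loop should contain
# smlal/smlal2 pairs produced by the llvm.aarch64.neon.smull calls in the
# intrinsic (requires an LLVM build with the AArch64 target enabled).
print(func.get_source("asm"))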