diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index 4eed56a22572..78b6e4529223 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -20,6 +20,7 @@ import tvm from tvm import te from tvm import autotvm +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity from .. import nn from ..utils import get_const_tuple from ..nn.utils import get_const_int, get_pad_tuple @@ -302,9 +303,29 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_ ) cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll") - cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy="try_unroll_vec") + cfg.define_annotate("ann_spatial", [owi, oci], policy="try_unroll_vec") # ==================================================================== + # If there are no tuning records, use this config + if cfg.is_fallback: + + def _tile_size(axis, candidates): + for candidate in candidates: + tiles_divisible_by_candidate = axis % candidate == 0 + if tiles_divisible_by_candidate: + return candidate + return 1 + + # Tile size 8 results in efficient vectorization for these schedules. + # If the axis is not divisible by 8, try 4 + cfg["tile_oh"] = SplitEntity([-1, 1]) + cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])]) + cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])]) + cfg["ann_spatial"] = AnnotateEntity(["none", "vec"]) + cfg["ann_reduce"] = AnnotateEntity(["none", "none"]) + cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + cfg["compat"] = OtherOptionEntity(0) + OCI = cfg["tile_co"].size[-1] OHI = cfg["tile_oh"].size[-1] OWI = cfg["tile_ow"].size[-1] @@ -390,7 +411,7 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): data_vec = conv.op.input_tensors[0] kernel_vec = conv.op.input_tensors[1] data_pad = data_vec.op.input_tensors[0] - OHI = cfg["tile_oh"].size[-1] + OWI = cfg["tile_ow"].size[-1] OCI = cfg["tile_co"].size[-1] @@ -402,20 +423,18 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): oho, ohi = cfg["tile_oh"].apply(s, output, oh) owo, owi = cfg["tile_ow"].apply(s, output, ow) s[output].reorder(n, oho, owo, oco, ohi, owi, oci) - cfg["ann_spatial"].apply( - s, output, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg - ) - cfg.define_knob("compat", [0, 1, 2]) - if cfg["compat"].val < 2: - compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 - s[conv].compute_at(s[output], compat_axis) + cfg["ann_spatial"].apply(s, output, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg) + + cfg.define_knob("compat", [0, 1]) + compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 + s[conv].compute_at(s[output], compat_axis) paxis = s[output].fuse(n, oho) s[output].parallel(paxis) # schedule conv n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis ic, kh, kw = s[conv].op.reduce_axis - cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ohi, owi, ic, oci]) + cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci]) cfg["ann_reduce"].apply( s, conv, @@ -424,33 +443,22 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): max_unroll=16, cfg=cfg, ) - cfg["ann_spatial"].apply( - s, conv, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg - ) - if cfg["compat"].val < 2: - compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 - s[kernel_vec].compute_at(s[conv], compat_axis) - s[data_vec].compute_at(s[conv], compat_axis) - - if not autotvm.GLOBAL_SCOPE.in_tuning: - # schedule kernel pack - oco, kh, kw, ic, oci = kernel_vec.op.axis - s[kernel_vec].vectorize(oci) - s[kernel_vec].unroll(ic) - if cfg["compat"].val == 2: - s[kernel_vec].parallel(oco) - - # schedule data pack + cfg["ann_spatial"].apply(s, conv, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg) + + # schedule data_vec, data_pad and kernel_vec + compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 + s[kernel_vec].compute_at(s[conv], compat_axis) + s[data_vec].compute_at(s[conv], compat_axis) + + # Inlining kernel vec brings a performance improvement, but the tuner seems to not + # like it, so inline only when we are using the fallback config + if cfg.is_fallback: + s[kernel_vec].compute_inline() + if data_vec.op.name == "data_vec_undilated": n, oho, owo, kh, kw, ic, ohi, owi = s[data_vec].op.axis - s[data_vec].vectorize(owi) - s[data_vec].unroll(ohi) else: n, oho, owo, ohi, owi, ic = s[data_vec].op.axis - s[data_vec].vectorize(ic) - s[data_vec].unroll(owi) - if cfg["compat"].val == 2: - paxis = s[data_vec].fuse(n, oho) - s[data_vec].parallel(paxis) + s[data_pad].compute_at(s[data_vec], n) return s diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py index 73cdd9b85d28..8c39dc4f16da 100644 --- a/tests/micro/arduino/test_arduino_workflow.py +++ b/tests/micro/arduino/test_arduino_workflow.py @@ -95,7 +95,7 @@ def test_model_platform_templating(project_dir, project): # TVM causes the amount of memory needed to decrease. workspace_size = int(workspace_size_defs[0]) assert workspace_size < 30000 - assert workspace_size > 10000 + assert workspace_size > 9000 def test_import_rerouting(project_dir, project):