From ab29a0d45829235ef6c24ea78eb4a1e03d363018 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Wed, 15 Feb 2023 16:19:28 +0000 Subject: [PATCH 1/5] [TOPI] Bugfix arm_cpu schedule_conv2d_spatial_pack_nhwc schedule No changes to the compute, various bugfixes and improvements to the corresponding NHWC schedule: * There is currently a block that is not run as a part of tuning trials, but gets run during compilation with tuning logs. Since a lot of unrolling and vectorization happens there, for some conv2d operators the extra vectorizing and unrolling results in about 18x size increase in asm and can take around 10 minutes per operator to compile. That essentially makes whole networks uncompilable, so remove that block. * There is no fallback config or NHWC logs in the TopHub. So add a fallback config. This significantly reduces the compile time without tuning, e.g. by about 10x for mobilenet. * The order of axes we passed to reorder_config was different to the order that was used to define the reorder. By looking at the compute definition and based on tuning results of whole networks, it seems to be a bug. * Constrain potential unrolling to OWI and OCI axes as unrolling across OHI results in uncompilably huge code size. This change reduces the number of unsuccessful tuning trials from about 50% to about 20%. * Other minor tweaks. Change-Id: I426b80154ddae96bf7b9f06e05e178eee2a8b087 --- .../tvm/topi/arm_cpu/conv2d_spatial_pack.py | 76 +++++++++---------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index 4eed56a22572..43fc4f413dff 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -23,6 +23,7 @@ from .. 
import nn from ..utils import get_const_tuple from ..nn.utils import get_const_int, get_pad_tuple +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): @@ -302,15 +303,25 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_ ) cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll") - cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy="try_unroll_vec") + cfg.define_annotate("ann_spatial", [owi, oci], policy="try_unroll_vec") # ==================================================================== - OCI = cfg["tile_co"].size[-1] + # If there are no tuning records, use this config + if cfg.is_fallback: + cfg["tile_oh"] = SplitEntity([-1, 1]) + cfg["tile_ow"] = SplitEntity([-1, 8]) + cfg["tile_oc"] = SplitEntity([-1, 8]) + cfg["ann_spatial"] = AnnotateEntity(["none", "vec"]) + cfg["ann_reduce"] = AnnotateEntity(["none", "none"]) + cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + cfg["compat"] = OtherOptionEntity(0) + + OCI = cfg["tile_oc"].size[-1] OHI = cfg["tile_oh"].size[-1] OWI = cfg["tile_ow"].size[-1] - OCO = OC // OCI + OCO = max(1, OC // OCI) OHO = OH // OHI - OWO = OW // OWI + OWO = max(1, OW // OWI) kvshape = (OCO, KH, KW, IC, OCI) ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI) @@ -390,32 +401,30 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): data_vec = conv.op.input_tensors[0] kernel_vec = conv.op.input_tensors[1] data_pad = data_vec.op.input_tensors[0] - OHI = cfg["tile_oh"].size[-1] + OWI = cfg["tile_ow"].size[-1] - OCI = cfg["tile_co"].size[-1] + OCI = cfg["tile_oc"].size[-1] # schedule unpack/output if output != unpack: s[unpack].compute_inline() n, oh, ow, oc = s[output].op.axis - oco, oci = cfg["tile_co"].apply(s, output, oc) + oco, oci = cfg["tile_oc"].apply(s, output, oc) oho, ohi = cfg["tile_oh"].apply(s, output, oh) owo, owi = 
cfg["tile_ow"].apply(s, output, ow) s[output].reorder(n, oho, owo, oco, ohi, owi, oci) - cfg["ann_spatial"].apply( - s, output, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg - ) - cfg.define_knob("compat", [0, 1, 2]) - if cfg["compat"].val < 2: - compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 - s[conv].compute_at(s[output], compat_axis) + cfg["ann_spatial"].apply(s, output, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg) + + cfg.define_knob("compat", [0, 1]) + compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 + s[conv].compute_at(s[output], compat_axis) paxis = s[output].fuse(n, oho) s[output].parallel(paxis) # schedule conv n, oho, owo, oco, ohi, owi, oci = s[conv].op.axis ic, kh, kw = s[conv].op.reduce_axis - cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ohi, owi, ic, oci]) + cfg["reorder_conv"].apply(s, conv, [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci]) cfg["ann_reduce"].apply( s, conv, @@ -424,33 +433,22 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): max_unroll=16, cfg=cfg, ) - cfg["ann_spatial"].apply( - s, conv, [ohi, owi, oci], axis_lens=[OHI, OWI, OCI], max_unroll=16, cfg=cfg - ) - if cfg["compat"].val < 2: - compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 - s[kernel_vec].compute_at(s[conv], compat_axis) - s[data_vec].compute_at(s[conv], compat_axis) - - if not autotvm.GLOBAL_SCOPE.in_tuning: - # schedule kernel pack - oco, kh, kw, ic, oci = kernel_vec.op.axis - s[kernel_vec].vectorize(oci) - s[kernel_vec].unroll(ic) - if cfg["compat"].val == 2: - s[kernel_vec].parallel(oco) - - # schedule data pack + cfg["ann_spatial"].apply(s, conv, [owi, oci], axis_lens=[OWI, OCI], max_unroll=16, cfg=cfg) + + # schedule data_vec, data_pad and kernel_vec + compat_axis = [owo, oco][cfg["compat"].val] # pylint: disable=R1706 + s[kernel_vec].compute_at(s[conv], compat_axis) + s[data_vec].compute_at(s[conv], compat_axis) + + # Inlining kernel vec brings a 
performance improvement, but the tuner seems to not + # like it, so inline only when we are using the fallback config + if cfg.is_fallback: + s[kernel_vec].compute_inline() + if data_vec.op.name == "data_vec_undilated": n, oho, owo, kh, kw, ic, ohi, owi = s[data_vec].op.axis - s[data_vec].vectorize(owi) - s[data_vec].unroll(ohi) else: n, oho, owo, ohi, owi, ic = s[data_vec].op.axis - s[data_vec].vectorize(ic) - s[data_vec].unroll(owi) - if cfg["compat"].val == 2: - paxis = s[data_vec].fuse(n, oho) - s[data_vec].parallel(paxis) + s[data_pad].compute_at(s[data_vec], n) return s From 3ca715e6c643f87849e967191cc48ba8470407eb Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Thu, 16 Feb 2023 15:16:17 +0000 Subject: [PATCH 2/5] Fix linting and bug in the default config --- python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index 43fc4f413dff..5a80fef045c5 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -20,10 +20,10 @@ import tvm from tvm import te from tvm import autotvm +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity from .. 
import nn from ..utils import get_const_tuple from ..nn.utils import get_const_int, get_pad_tuple -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, AnnotateEntity, ReorderEntity def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): @@ -308,9 +308,17 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_ # If there are no tuning records, use this config if cfg.is_fallback: + + def _tile_size(axis, candidates): + for candidate in candidates: + tiles_divisible_by_candidate = axis % candidate == 0 + if tiles_divisible_by_candidate: + return candidate + return 1 + cfg["tile_oh"] = SplitEntity([-1, 1]) - cfg["tile_ow"] = SplitEntity([-1, 8]) - cfg["tile_oc"] = SplitEntity([-1, 8]) + cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])]) + cfg["tile_oc"] = SplitEntity([-1, _tile_size(OC, [8, 4])]) cfg["ann_spatial"] = AnnotateEntity(["none", "vec"]) cfg["ann_reduce"] = AnnotateEntity(["none", "none"]) cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) @@ -319,9 +327,9 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_ OCI = cfg["tile_oc"].size[-1] OHI = cfg["tile_oh"].size[-1] OWI = cfg["tile_ow"].size[-1] - OCO = max(1, OC // OCI) + OCO = OC // OCI OHO = OH // OHI - OWO = max(1, OW // OWI) + OWO = OW // OWI kvshape = (OCO, KH, KW, IC, OCI) ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI) From 4308191c6d15144ac7f934387e24214d6de0a899 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Mon, 20 Feb 2023 10:30:25 +0000 Subject: [PATCH 3/5] Reduce the minimum ws size in Arduino test and tile_oc->tile_co Looks like tile_co as a name is deeply ingrained into the codebase... 
--- python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 8 ++++---- tests/micro/arduino/test_arduino_workflow.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index 5a80fef045c5..eae4f2790372 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -318,13 +318,13 @@ def _tile_size(axis, candidates): cfg["tile_oh"] = SplitEntity([-1, 1]) cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])]) - cfg["tile_oc"] = SplitEntity([-1, _tile_size(OC, [8, 4])]) + cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])]) cfg["ann_spatial"] = AnnotateEntity(["none", "vec"]) cfg["ann_reduce"] = AnnotateEntity(["none", "none"]) cfg["reorder_conv"] = ReorderEntity([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) cfg["compat"] = OtherOptionEntity(0) - OCI = cfg["tile_oc"].size[-1] + OCI = cfg["tile_co"].size[-1] OHI = cfg["tile_oh"].size[-1] OWI = cfg["tile_ow"].size[-1] OCO = OC // OCI @@ -411,13 +411,13 @@ def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): data_pad = data_vec.op.input_tensors[0] OWI = cfg["tile_ow"].size[-1] - OCI = cfg["tile_oc"].size[-1] + OCI = cfg["tile_co"].size[-1] # schedule unpack/output if output != unpack: s[unpack].compute_inline() n, oh, ow, oc = s[output].op.axis - oco, oci = cfg["tile_oc"].apply(s, output, oc) + oco, oci = cfg["tile_co"].apply(s, output, oc) oho, ohi = cfg["tile_oh"].apply(s, output, oh) owo, owi = cfg["tile_ow"].apply(s, output, ow) s[output].reorder(n, oho, owo, oco, ohi, owi, oci) diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py index 73cdd9b85d28..8c39dc4f16da 100644 --- a/tests/micro/arduino/test_arduino_workflow.py +++ b/tests/micro/arduino/test_arduino_workflow.py @@ -95,7 +95,7 @@ def test_model_platform_templating(project_dir, project): # TVM causes the amount of memory needed to decrease. 
workspace_size = int(workspace_size_defs[0]) assert workspace_size < 30000 - assert workspace_size > 10000 + assert workspace_size > 9000 def test_import_rerouting(project_dir, project): From 140e0ba3b203ed953112e937a43ff9c8d6e8133a Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Tue, 21 Feb 2023 16:35:30 +0000 Subject: [PATCH 4/5] Add a comment about default config --- python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index eae4f2790372..a2429d1b2304 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -316,6 +316,8 @@ def _tile_size(axis, candidates): return candidate return 1 + # Tile size 8 results in efficient vectorization for these schedules. + # If the axis is not divisible by 8, try 4 cfg["tile_oh"] = SplitEntity([-1, 1]) cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])]) cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])]) From 5a0ec43a3a39ecb7e99ff10e1bebb2c95e99d01f Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Tue, 21 Feb 2023 17:27:07 +0000 Subject: [PATCH 5/5] Lint the comment -_- --- python/tvm/topi/arm_cpu/conv2d_spatial_pack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py index a2429d1b2304..78b6e4529223 100644 --- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py +++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py @@ -317,7 +317,7 @@ def _tile_size(axis, candidates): return 1 # Tile size 8 results in efficient vectorization for these schedules. - # If the axis is not divisible by 8, try 4 + # If the axis is not divisible by 8, try 4 cfg["tile_oh"] = SplitEntity([-1, 1]) cfg["tile_ow"] = SplitEntity([-1, _tile_size(OW, [8, 4])]) cfg["tile_co"] = SplitEntity([-1, _tile_size(OC, [8, 4])])