From ce6ea3720612917cc6ca3ba8b089800e540545a7 Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Tue, 5 May 2020 23:53:35 +0800
Subject: [PATCH 1/3] [AutoTVM][TOPI] AutoTVM incorrect measurement

---
 topi/python/topi/mali/conv2d.py | 38 ++++++++++++---------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index d19592857086..30cdc609c1a7 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,20 +138,15 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
-        else:
-            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
-            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
-            fused, vec = s[kernel_vec].split(fused, VC)
-            bb, tt = s[kernel_vec].split(fused, max_threads)
-            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-            if VC in vec_size:
-                s[kernel_vec].vectorize(vec)
+        max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+        co, ci, kh, kw, vc = s[kernel_vec].op.axis
+        fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+        fused, vec = s[kernel_vec].split(fused, VC)
+        bb, tt = s[kernel_vec].split(fused, max_threads)
+        s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
+        s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
+        if VC in vec_size:
+            s[kernel_vec].vectorize(vec)
 
     # schedule convolution
     n, c, h, w, vh, vw, vc = s[conv].op.axis
@@ -345,16 +340,11 @@ def _schedule_winograd(cfg, s, op):
         kernel, G = s[U].op.input_tensors
         s[G].compute_inline()
         eps, nu, co, ci, vco, = s[U].op.axis
-        if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel transformation will be pre-computed during compilation, so we skip
-            # this part to make tuning records correct
-            s[U].pragma(eps, 'debug_skip_region')
-        else:
-            r_kh, r_kw = s[U].op.reduce_axis
-            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
-            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-            s[U].vectorize(vco)
-            tile_and_bind(s, U, co, ci, 1, 256)
+        r_kh, r_kw = s[U].op.reduce_axis
+        s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+        _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+        s[U].vectorize(vco)
+        tile_and_bind(s, U, co, ci, 1, 256)
 
     # dilation
     if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
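Background for the series: the old code excluded kernel packing from tuning
measurements by wrapping it in a 'debug_skip_region' pragma, which led to the
incorrect measurements named in the subject. Patches 2/3 and 3/3 below restore
the in_tuning special case, but implement it by handing the schedule a
placeholder that already has the converted layout, so the packing stage is
never part of the measured kernel. A minimal sketch of that placeholder
pattern follows; the shapes CO, CI, KH, KW, VC and the packing lambda are
illustrative assumptions, not the patch's code:

    from tvm import te, autotvm

    CO, CI, KH, KW, VC = 64, 32, 3, 3, 4  # assumed sizes, for illustration

    # The schedule normally packs the kernel into a vectorized layout:
    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
    kernel_vec = te.compute(
        (CO // VC, CI, KH, KW, VC),
        lambda co, ci, kh, kw, vc: kernel[co * VC + vc, ci, kh, kw],
        name="kernel_vec",
    )

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # During tuning, treat the packed kernel as a plain input so the
        # packing work is excluded from the measured time.
        kernel_vec = te.placeholder(kernel_vec.shape, kernel_vec.dtype,
                                    name="kernel")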
From 64d1e6a38d5c44deb86046d46a5e0c4167b224d8 Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Mon, 25 May 2020 21:05:28 +0800
Subject: [PATCH 2/3] create new placeholder with converted layout

---
 topi/python/topi/mali/conv2d.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 30cdc609c1a7..91bdd360b1b2 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,15 +138,22 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
-        max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-        co, ci, kh, kw, vc = s[kernel_vec].op.axis
-        fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
-        fused, vec = s[kernel_vec].split(fused, VC)
-        bb, tt = s[kernel_vec].split(fused, max_threads)
-        s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
-        s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
-        if VC in vec_size:
-            s[kernel_vec].vectorize(vec)
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # Directly use modified data layout placeholder.
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            kvshape = (co // vc, ci, kh, kw, vc)
+            kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
+            s[kernel_vec] = kernel_vec
+        else:
+            max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+            fused, vec = s[kernel_vec].split(fused, VC)
+            bb, tt = s[kernel_vec].split(fused, max_threads)
+            s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x"))
+            s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x"))
+            if VC in vec_size:
+                s[kernel_vec].vectorize(vec)
 
     # schedule convolution
     n, c, h, w, vh, vw, vc = s[conv].op.axis

From 1629b567270eab07d71121bf1fafd85bd3797c3e Mon Sep 17 00:00:00 2001
From: cchung100m
Date: Tue, 26 May 2020 19:01:15 +0800
Subject: [PATCH 3/3] update _schedule_winograd

---
 topi/python/topi/mali/conv2d.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 91bdd360b1b2..12eb3d7c78c8 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -138,15 +138,14 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
             s[data_vec].unroll(vw)
 
     if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec':
+        co, ci, kh, kw, vc = s[kernel_vec].op.axis
         if autotvm.GLOBAL_SCOPE.in_tuning:
             # Directly use modified data layout placeholder.
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             kvshape = (co // vc, ci, kh, kw, vc)
             kernel_vec = tvm.te.placeholder(kvshape, kernel_vec.dtype, name="kernel")
             s[kernel_vec] = kernel_vec
         else:
             max_threads = tvm.target.Target.current(allow_none=False).max_num_threads
-            co, ci, kh, kw, vc = s[kernel_vec].op.axis
             fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
             fused, vec = s[kernel_vec].split(fused, VC)
             bb, tt = s[kernel_vec].split(fused, max_threads)
@@ -347,11 +346,12 @@ def _schedule_winograd(cfg, s, op):
         kernel, G = s[U].op.input_tensors
         s[G].compute_inline()
         eps, nu, co, ci, vco, = s[U].op.axis
-        r_kh, r_kw = s[U].op.reduce_axis
-        s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
-        _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-        s[U].vectorize(vco)
-        tile_and_bind(s, U, co, ci, 1, 256)
+        if not autotvm.GLOBAL_SCOPE.in_tuning:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+            s[U].vectorize(vco)
+            tile_and_bind(s, U, co, ci, 1, 256)
 
     # dilation
     if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
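For reviewers who want to trace when the new branches run: AutoTVM's
measurement side sets autotvm.GLOBAL_SCOPE.in_tuning while building
candidates, so the placeholder path above is active only during tuning, and a
normal build still emits the full packing and transform schedules. A small
sketch of that control flow; the flag is toggled by hand here purely for
illustration, since the tuner manages it internally:

    from tvm import autotvm

    # Outside a tuning session the flag is False, so a normal build takes
    # the full schedule path (fuse/split/bind of kernel_vec).
    assert not autotvm.GLOBAL_SCOPE.in_tuning

    autotvm.GLOBAL_SCOPE.in_tuning = True   # what the measure side sets
    # ... build and time a tuning candidate here; kernel packing is now a
    # plain placeholder input, so it does not appear in the measured time ...
    autotvm.GLOBAL_SCOPE.in_tuning = False  # restored for normal builds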