From b86700fe679a9456f2e5740bdb75b02e2fc0c2a9 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 22 Jul 2020 14:31:44 +0800 Subject: [PATCH 01/18] fix nms for gpu --- tests/python/relay/test_op_level5.py | 8 +- topi/python/topi/cuda/nms.py | 157 ++++++++++++++++++++++----- 2 files changed, 130 insertions(+), 35 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 3a94fc69e001..aa4a6e064d60 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -270,9 +270,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) - # get_valid_count for cuda, opencl doesn't do data rearrangement - if target in ['cuda', 'opencl']: - return + # get_valid_count for opencl doesn't do data rearrangement + if target in ['opencl']: + continue tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) @@ -321,8 +321,6 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if target == 'cuda': - return op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 4772080a60eb..00e410707610 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -93,8 +93,6 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) out_indices = ib.buffer_ptr(out_indices) - atomic_add_return = ib.allocate( - valid_count.dtype, (1,), name='atomic_add_return', scope='local') one_count = tvm.tir.const(1, dtype=valid_count.dtype) one = tvm.tir.const(1, dtype=out.dtype) score_threshold = tvm.ir.make_node( @@ -102,35 +100,29 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index) score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index) - max_threads = int(tvm.target.Target.current( - allow_none=False).max_num_threads) - nthread_tx = max_threads - nthread_bx = batch_size * num_anchors // max_threads + 1 + nthread_tx = batch_size + nthread_bx = 1 tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = bx * max_threads + tx - idxd = tvm.tir.indexdiv - - # initialize valid_count - with ib.if_scope(tid < batch_size): - valid_count[tid] = 0 - with ib.if_scope(tid < batch_size * num_anchors): - i = idxd(tid, num_anchors) + tid = tx + + # each thread process one batch + valid_count[tid] = 0 + with ib.for_range(0, num_anchors) as anchor_ind: + with ib.for_range(0, elem_length) as k: + out[anchor_ind * elem_length + k] = -one + out_indices[anchor_ind + tid*num_anchors] = -one_count + + with ib.for_range(0, num_anchors) as anchor_ind: with ib.if_scope( - tvm.tir.all(data[tid * elem_length + score_index] > score_threshold, - tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): - atomic_add_return[0] = atomic_add(tvm.tir.call_intrin("handle", "tir.address_of", - valid_count[i]), one_count) + tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold, + tvm.tir.any(id_index < 0, data[anchor_ind * elem_length + id_index] >= 0))): + valid_count[tid] = valid_count[tid] + 1 with ib.for_range(0, elem_length) as k: - out[tid * elem_length + k] = data[tid * elem_length + k] - out_indices[tid + k] = tid + k - with ib.else_scope(): - with ib.for_range(0, elem_length) as k: - out[tid * elem_length + k] = -one - out_indices[tid + k] = -one_count - + out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k] + out_indices[(valid_count[tid]-1) + tid*num_anchors] = anchor_ind return ib.get() @@ -184,7 +176,101 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): return [valid_count, out, out_indices] -def nms_ir(data, sorted_index, valid_count, out, box_indices, +def rearrange_indices_out_ir(data, output, valid_box_count): + """Low level IR to get rearrange_indices_out. + Parameters + ---------- + data : Buffer + Input data. 2-D Buffer with shape [batch_size, num_anchors]. + + output: Buffer + 2-D Buffer with shape [batch_size, num_anchors]. + + valid_box_count : Buffer + 2-D Buffer with shape [batch_size, 1]. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + output = ib.buffer_ptr(output) + valid_box_count = ib.buffer_ptr(valid_box_count) + + one = tvm.tir.const(1, dtype=output.dtype) + zero = tvm.tir.const(0, dtype=output.dtype) + + nthread_tx = batch_size + nthread_bx = 1 + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = tx + + valid_box_count[tid] = 0 + with ib.for_range(0, num_anchors) as anchor_ind: + output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind] + ''' + valid_box_count[tid] = 0 + with ib.for_range(0, num_anchors) as anchor_ind: + with ib.if_scope(data[tid * num_anchors + anchor_ind] >= zero): + output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind] + valid_box_count[tid] = valid_box_count[tid] + 1 + with ib.if_scope(tvm.tir.any(data[tid * num_anchors + anchor_ind] > num_anchors, + data[tid * num_anchors + anchor_ind] < -num_anchors)): + output[tid * num_anchors + valid_box_count[tid]] = zero + valid_box_count[tid] = valid_box_count[tid] + 1 + with ib.if_scope(anchor_ind >= valid_box_count[tid]): + output[tid * num_anchors + anchor_ind] = -one + ''' + return ib.get() + + +def rearrange_indices_out(data): + """Rearrange nms output to move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + NMS output. 2-D + tensor with shape [batch_size, num_anchors]. + + Returns + ------- + output : tvm.te.Tensor or numpy NDArray + 2-D tensor with shape [batch_size, num_anchors]. + + valid_box_count : tvm.te.Tensor or numpy NDArray + Tensor with shape [batch_size, 1], indicates + the valid number of boxes. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + data_buf = tvm.tir.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) + out_indices_buf = tvm.tir.decl_buffer( + data.shape, data.dtype, "out_indices_buf", data_alignment=8) + valid_count_buf = tvm.tir.decl_buffer( + (batch_size, 1), "int32", "valid_count_buf", data_alignment=8) + + output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], + [data], + lambda ins, outs: rearrange_indices_out_ir( + ins[0], outs[0], outs[1]), + in_buffers=[data_buf], + out_buffers=[out_indices_buf, valid_count_buf], + name="rearrange_indices_out", + tag="rearrange_indices_out_gpu") + return [output, valid_box_count] + + +def nms_ir(data, sorted_index, valid_count, indices, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): """Low level IR routing for transform location in multibox_detection operator. @@ -200,6 +286,9 @@ def nms_ir(data, sorted_index, valid_count, out, box_indices, valid_count : Buffer Buffer of number of valid output boxes. + indices : Buffer + Buffer represents the index of box in original data. + out : Buffer Output buffer. @@ -253,6 +342,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) + indices = ib.buffer_ptr(indices) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.allocate( @@ -346,6 +436,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): num_valid_boxes[0] += 1 + # convert box_indices to represent original data + org_idx = box_indices[i * num_anchors + j] + with ib.if_scope(box_indices[i * num_anchors + j] >= 0): + box_indices[i * num_anchors + j] = indices[i * num_anchors + org_idx] return ib.get() @@ -454,19 +548,22 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer( + indices.shape, indices.dtype, "indices_buf", data_alignment=8) + out, box_indices = \ te.extern([data.shape, score_shape], - [data, sort_tensor, valid_count], + [data, sort_tensor, valid_count, indices], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], + ins[0], ins[1], ins[2], ins[3], outs[0], outs[1], max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index), dtype=[data.dtype, "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], name="nms", tag="nms") # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: - return box_indices + return rearrange_indices_out(box_indices) return out From 84ba8a8cb90083b3ec41170f1aab12c957cacbdd Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 22 Jul 2020 15:03:26 +0800 Subject: [PATCH 02/18] fix pylint error --- topi/python/topi/cuda/nms.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 00e410707610..26dcc5b508fe 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -118,7 +118,8 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, with ib.for_range(0, num_anchors) as anchor_ind: with ib.if_scope( tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold, - tvm.tir.any(id_index < 0, data[anchor_ind * elem_length + id_index] >= 0))): + tvm.tir.any(id_index < 0, + data[anchor_ind * elem_length + id_index] >= 0))): valid_count[tid] = valid_count[tid] + 1 with ib.for_range(0, elem_length) as k: out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k] @@ -202,9 +203,6 @@ def rearrange_indices_out_ir(data, output, valid_box_count): output = ib.buffer_ptr(output) valid_box_count = ib.buffer_ptr(valid_box_count) - one = tvm.tir.const(1, dtype=output.dtype) - zero = tvm.tir.const(0, dtype=output.dtype) - nthread_tx = batch_size nthread_bx = 1 tx = te.thread_axis("threadIdx.x") @@ -213,22 +211,9 @@ def rearrange_indices_out_ir(data, output, valid_box_count): ib.scope_attr(bx, "thread_extent", nthread_bx) tid = tx - valid_box_count[tid] = 0 + valid_box_count[tid] = 0 with ib.for_range(0, num_anchors) as anchor_ind: output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind] - ''' - valid_box_count[tid] = 0 - with ib.for_range(0, num_anchors) as anchor_ind: - with ib.if_scope(data[tid * num_anchors + anchor_ind] >= zero): - output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind] - valid_box_count[tid] = valid_box_count[tid] + 1 - with ib.if_scope(tvm.tir.any(data[tid * num_anchors + anchor_ind] > num_anchors, - data[tid * num_anchors + anchor_ind] < -num_anchors)): - output[tid * num_anchors + valid_box_count[tid]] = zero - valid_box_count[tid] = valid_box_count[tid] + 1 - with ib.if_scope(anchor_ind >= valid_box_count[tid]): - output[tid * num_anchors + anchor_ind] = -one - ''' return ib.get() @@ -251,7 +236,6 @@ def rearrange_indices_out(data): the valid number of boxes. """ batch_size = data.shape[0] - num_anchors = data.shape[1] data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) out_indices_buf = tvm.tir.decl_buffer( From e95d2a47291ef4f849548b329540beac0b957e89 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 22 Jul 2020 15:10:09 +0800 Subject: [PATCH 03/18] fix pylint error --- topi/python/topi/cuda/nms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 26dcc5b508fe..6f7e3c7f2bf2 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -118,7 +118,7 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, with ib.for_range(0, num_anchors) as anchor_ind: with ib.if_scope( tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold, - tvm.tir.any(id_index < 0, + tvm.tir.any(id_index < 0, data[anchor_ind * elem_length + id_index] >= 0))): valid_count[tid] = valid_count[tid] + 1 with ib.for_range(0, elem_length) as k: @@ -210,7 +210,7 @@ def rearrange_indices_out_ir(data, output, valid_box_count): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = tx - + valid_box_count[tid] = 0 with ib.for_range(0, num_anchors) as anchor_ind: output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind] @@ -242,8 +242,8 @@ def rearrange_indices_out(data): data.shape, data.dtype, "out_indices_buf", data_alignment=8) valid_count_buf = tvm.tir.decl_buffer( (batch_size, 1), "int32", "valid_count_buf", data_alignment=8) - - output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], + + output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], [data], lambda ins, outs: rearrange_indices_out_ir( ins[0], outs[0], outs[1]), From 4b4e94c6f9a014cf95db304dbee33137d5067fe3 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 22 Jul 2020 20:11:54 +0800 Subject: [PATCH 04/18] fix get_valid_counts for batch larger than 1 --- topi/python/topi/cuda/nms.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 6f7e3c7f2bf2..1df1391e4735 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -110,20 +110,24 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, # each thread process one batch valid_count[tid] = 0 + data_base_ind = tid * num_anchors * elem_length + ind_base_ind = tid * num_anchors with ib.for_range(0, num_anchors) as anchor_ind: with ib.for_range(0, elem_length) as k: - out[anchor_ind * elem_length + k] = -one - out_indices[anchor_ind + tid*num_anchors] = -one_count + out[data_base_ind + anchor_ind * elem_length + k] = -one + out_indices[ind_base_ind + anchor_ind] = -one_count with ib.for_range(0, num_anchors) as anchor_ind: with ib.if_scope( - tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold, - tvm.tir.any(id_index < 0, - data[anchor_ind * elem_length + id_index] >= 0))): + tvm.tir.all( + data[data_base_ind + anchor_ind * elem_length + score_index] > score_threshold, + tvm.tir.any(id_index < 0, + data[data_base_ind + anchor_ind * elem_length + id_index] >= 0))): valid_count[tid] = valid_count[tid] + 1 with ib.for_range(0, elem_length) as k: - out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k] - out_indices[(valid_count[tid]-1) + tid*num_anchors] = anchor_ind + out[data_base_ind + (valid_count[tid]-1) * elem_length + k] = \ + data[data_base_ind + anchor_ind * elem_length + k] + out_indices[ind_base_ind + (valid_count[tid]-1)] = anchor_ind return ib.get() From f8040723b70a49ba21adcd653b83dad6e96348e9 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Thu, 23 Jul 2020 10:01:11 +0800 Subject: [PATCH 05/18] remove add_atomic --- tests/python/relay/test_op_level5.py | 3 --- topi/python/topi/cuda/nms.py | 26 -------------------------- 2 files changed, 29 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index aa4a6e064d60..328a1f27e22d 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -270,9 +270,6 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) - # get_valid_count for opencl doesn't do data rearrangement - if target in ['opencl']: - continue tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 1df1391e4735..c9dbea7c5c99 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -25,32 +25,6 @@ from .. import tag -def cuda_atomic_add_rule(op): - if op.dtype == "float32": - return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) - if op.dtype == "float64": - return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) - if op.dtype == "int32": - return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) - raise RuntimeError("only support int32, float32 and float64") - -def opencl_atomic_add_rule(op): - if op.dtype == "int32": - return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) - raise RuntimeError("only support int32") - -tvm.target.intrin.register_intrin_rule( - "cuda", "atomic_add", cuda_atomic_add_rule, override=True) - -tvm.target.intrin.register_intrin_rule( - "opencl", "atomic_add", opencl_atomic_add_rule, override=True) - -tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) - -def atomic_add(x, y): - return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) - - def get_valid_counts_ir(data, valid_count, out, out_indices, score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes From b43f3ec0295bac005ce9e251f8c11178f2ddfe53 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Fri, 24 Jul 2020 10:22:47 +0800 Subject: [PATCH 06/18] wakey wakey GitHub Actions From 113985b25de3ac311d58489494baa399d23caf1e Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Mon, 27 Jul 2020 09:33:41 +0800 Subject: [PATCH 07/18] remove get_valid_counts single thread --- topi/python/topi/cuda/nms.py | 117 ++++++++++++++++------------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index c9dbea7c5c99..364591da71f4 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -25,32 +25,51 @@ from .. import tag +def cuda_atomic_add_rule(op): + if op.dtype == "float32": + return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) + if op.dtype == "float64": + return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) + if op.dtype == "int32": + return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) + raise RuntimeError("only support int32, float32 and float64") + +def opencl_atomic_add_rule(op): + if op.dtype == "int32": + return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) + raise RuntimeError("only support int32") + +tvm.target.intrin.register_intrin_rule( + "cuda", "atomic_add", cuda_atomic_add_rule, override=True) + +tvm.target.intrin.register_intrin_rule( + "opencl", "atomic_add", opencl_atomic_add_rule, override=True) + +tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) + +def atomic_add(x, y): + return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) + + def get_valid_counts_ir(data, valid_count, out, out_indices, score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes given a score threshold. Also prepares to move valid boxes to the top of input data. - Parameters ---------- data : Buffer Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. - valid_count : Buffer 1D buffer for valid number of boxes with shape [batch_size, ]. - flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - score_threshold : float32 Lower limit of score for valid bounding boxes. - id_index : optional, int index of the class categories, -1 to disable. - score_index: optional, int Index of the scores/confidence of boxes. - Returns ------- stmt : Stmt @@ -67,6 +86,8 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) out_indices = ib.buffer_ptr(out_indices) + atomic_add_return = ib.allocate( + valid_count.dtype, (1,), name='atomic_add_return', scope='local') one_count = tvm.tir.const(1, dtype=valid_count.dtype) one = tvm.tir.const(1, dtype=out.dtype) score_threshold = tvm.ir.make_node( @@ -74,60 +95,55 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index) score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index) - nthread_tx = batch_size - nthread_bx = 1 + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = batch_size * num_anchors // max_threads + 1 tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = tx - - # each thread process one batch - valid_count[tid] = 0 - data_base_ind = tid * num_anchors * elem_length - ind_base_ind = tid * num_anchors - with ib.for_range(0, num_anchors) as anchor_ind: - with ib.for_range(0, elem_length) as k: - out[data_base_ind + anchor_ind * elem_length + k] = -one - out_indices[ind_base_ind + anchor_ind] = -one_count - - with ib.for_range(0, num_anchors) as anchor_ind: + tid = bx * max_threads + tx + idxd = tvm.tir.indexdiv + + # initialize valid_count + with ib.if_scope(tid < batch_size): + valid_count[tid] = 0 + with ib.if_scope(tid < batch_size * num_anchors): + i = idxd(tid, num_anchors) with ib.if_scope( - tvm.tir.all( - data[data_base_ind + anchor_ind * elem_length + score_index] > score_threshold, - tvm.tir.any(id_index < 0, - data[data_base_ind + anchor_ind * elem_length + id_index] >= 0))): - valid_count[tid] = valid_count[tid] + 1 + tvm.tir.all(data[tid * elem_length + score_index] > score_threshold, + tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): + atomic_add_return[0] = atomic_add(tvm.tir.call_intrin("handle", "tir.address_of", + valid_count[i]), one_count) with ib.for_range(0, elem_length) as k: - out[data_base_ind + (valid_count[tid]-1) * elem_length + k] = \ - data[data_base_ind + anchor_ind * elem_length + k] - out_indices[ind_base_ind + (valid_count[tid]-1)] = anchor_ind + out[tid * elem_length + k] = data[tid * elem_length + k] + out_indices[tid + k] = tid + k + with ib.else_scope(): + with ib.for_range(0, elem_length) as k: + out[tid * elem_length + k] = -one + out_indices[tid + k] = -one_count + return ib.get() def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. - Parameters ---------- data : tvm.te.Tensor Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length]. - score_threshold : optional, float Lower limit of score for valid bounding boxes. - id_index : optional, int index of the class categories, -1 to disable. - score_index: optional, int Index of the scores/confidence of boxes. - Returns ------- valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. - out_tensor : tvm.te.Tensor Rearranged data tensor. """ @@ -236,46 +252,33 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): """Low level IR routing for transform location in multibox_detection operator. - Parameters ---------- data : Buffer Buffer of output boxes with class and score. - sort_index : Buffer Buffer of output box indexes sorted by score. - valid_count : Buffer Buffer of number of valid output boxes. - indices : Buffer Buffer represents the index of box in original data. - out : Buffer Output buffer. - max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. - iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. - force_suppress : boolean Whether to suppress all detections regardless of class_id. - top_k : int Keep maximum top k detections before nms, -1 for no limit. - coord_start : int Start index of the consecutive 4 coordinates. - id_index : int index of the class categories, -1 to disable. - score_index : optional, int Index of the scores/confidence of boxes. - Returns ------- stmt : Stmt @@ -304,7 +307,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) - indices = ib.buffer_ptr(indices) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.allocate( @@ -411,7 +413,6 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. - Parameters ---------- data : tvm.te.Tensor @@ -419,55 +420,41 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. It could be the second output out_tensor of get_valid_counts. - valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. It could be the output valid_count of get_valid_counts. - indices : tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors], represents the index of box in original data. It could be the third output out_indices of get_valid_counts. The values in the second dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. - max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. - iou_threshold : optional, float Non-maximum suppression threshold. - force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. - coord_start : required, int Start index of the consecutive 4 coordinates. - score_index : optional, int Index of the scores/confidence of boxes. - id_index : optional, int index of the class categories, -1 to disable. - return_indices : boolean Whether to return box indices in input data. - invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. - Returns ------- out : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. - Example -------- .. code-block:: python - # An example to use nms dshape = (1, 5, 6) data = te.placeholder(dshape, name="data") @@ -528,4 +515,4 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, if return_indices: return rearrange_indices_out(box_indices) - return out + return out \ No newline at end of file From bb3b15d0096e3ae7147517512febeb7447969dd3 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Mon, 27 Jul 2020 09:47:24 +0800 Subject: [PATCH 08/18] only rearrangel output --- tests/python/relay/test_op_level5.py | 3 ++ topi/python/topi/cuda/nms.py | 45 ++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 328a1f27e22d..c465741b7226 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -270,6 +270,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) + # get_valid_count for cuda, opencl doesn't do data rearrangement + if target in ['cuda', 'opencl']: + continue tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 364591da71f4..20327ce6eba6 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -56,20 +56,27 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, """Low level IR to get valid count of bounding boxes given a score threshold. Also prepares to move valid boxes to the top of input data. + Parameters ---------- data : Buffer Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. + valid_count : Buffer 1D buffer for valid number of boxes with shape [batch_size, ]. + flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. + score_threshold : float32 Lower limit of score for valid bounding boxes. + id_index : optional, int index of the class categories, -1 to disable. + score_index: optional, int Index of the scores/confidence of boxes. + Returns ------- stmt : Stmt @@ -130,20 +137,26 @@ def get_valid_counts_ir(data, valid_count, out, out_indices, def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. + Parameters ---------- data : tvm.te.Tensor Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length]. + score_threshold : optional, float Lower limit of score for valid bounding boxes. + id_index : optional, int index of the class categories, -1 to disable. + score_index: optional, int Index of the scores/confidence of boxes. + Returns ------- valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. + out_tensor : tvm.te.Tensor Rearranged data tensor. """ @@ -252,33 +265,46 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): """Low level IR routing for transform location in multibox_detection operator. + Parameters ---------- data : Buffer Buffer of output boxes with class and score. + sort_index : Buffer Buffer of output box indexes sorted by score. + valid_count : Buffer Buffer of number of valid output boxes. + indices : Buffer Buffer represents the index of box in original data. + out : Buffer Output buffer. + max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. + iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. + force_suppress : boolean Whether to suppress all detections regardless of class_id. + top_k : int Keep maximum top k detections before nms, -1 for no limit. + coord_start : int Start index of the consecutive 4 coordinates. + id_index : int index of the class categories, -1 to disable. + score_index : optional, int Index of the scores/confidence of boxes. + Returns ------- stmt : Stmt @@ -308,6 +334,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) + indices = ib.buffer_ptr(indices) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.allocate( "int32", (1,), name="num_valid_boxes", scope="local") @@ -413,6 +440,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. + Parameters ---------- data : tvm.te.Tensor @@ -420,41 +448,55 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. It could be the second output out_tensor of get_valid_counts. + valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. It could be the output valid_count of get_valid_counts. + indices : tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors], represents the index of box in original data. It could be the third output out_indices of get_valid_counts. The values in the second dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. + max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. + iou_threshold : optional, float Non-maximum suppression threshold. + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. + top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. + coord_start : required, int Start index of the consecutive 4 coordinates. + score_index : optional, int Index of the scores/confidence of boxes. + id_index : optional, int index of the class categories, -1 to disable. + return_indices : boolean Whether to return box indices in input data. + invalid_to_bottom : optional, boolean Whether to move all valid bounding boxes to the top. + Returns ------- out : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. + Example -------- .. code-block:: python + # An example to use nms dshape = (1, 5, 6) data = te.placeholder(dshape, name="data") @@ -511,8 +553,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], name="nms", tag="nms") - # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: return rearrange_indices_out(box_indices) - return out \ No newline at end of file + return out From 796cc75bdd2a24feb70f187d85ed01da6071ff33 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Sat, 29 Aug 2020 10:02:40 +0800 Subject: [PATCH 09/18] fix conflict --- tests/python/relay/test_op_level5.py | 51 ++++++++------ topi/python/topi/cuda/nms.py | 101 ++------------------------- 2 files changed, 36 insertions(+), 116 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c465741b7226..bfbef0db048b 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -23,7 +23,7 @@ from tvm import relay from tvm.relay import transform from tvm.relay.testing import ctx_list, run_infer_type -import topi.testing +import tvm.topi.testing def test_resize_infer_type(): @@ -41,19 +41,21 @@ def test_resize_infer_type(): assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8") def test_resize(): - def verify_resize(dshape, scale, method, layout): + def verify_resize(dshape, scale, method, layout, coord_trans): if layout == "NHWC": size = (dshape[1] * scale, dshape[2] * scale) else: size = (dshape[2] * scale, dshape[3] * scale) x_data = np.random.uniform(size=dshape).astype("float32") + if method == "bilinear": - ref_res = topi.testing.bilinear_resize_python(x_data, size, layout) + ref_res = tvm.topi.testing.bilinear_resize_python(x_data, size, layout, coord_trans) else: - ref_res = topi.testing.upsampling_python(x_data, (scale, scale), layout) + ref_res = tvm.topi.testing.upsampling_python(x_data, (scale, scale), layout) x = relay.var("x", relay.TensorType(dshape, "float32")) - z = relay.image.resize(x, size, layout, method, "align_corners") + z = relay.image.resize(x, size, layout, method, + coordinate_transformation_mode=coord_trans) assert "size=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType(ref_res.shape, "float32") @@ -63,10 +65,13 @@ def verify_resize(dshape, scale, method, layout): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6) - for method in ["bilinear", "nearest_neighbor"]: - for layout in ["NHWC", "NCHW"]: - verify_resize((1, 4, 4, 4), 2, method, layout) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-5) + + for layout in ["NHWC", "NCHW"]: + verify_resize((1, 4, 4, 4), 2, "bilinear", layout, "align_corners") + verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "half_pixel") + verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "asymmetric") + verify_resize((3, 4, 5, 6), 5, "nearest_neighbor", layout, "asymmetric") def test_resize3d_infer_type(): n, c, d, h, w = te.size_var("n"), te.size_var("c"), te.size_var("d"), te.size_var("h"), te.size_var("w") @@ -91,9 +96,9 @@ def verify_resize(dshape, scale, method, layout): x_data = np.random.uniform(size=dshape).astype("float32") if method == "trilinear": - ref_res = topi.testing.trilinear_resize3d_python(x_data, size, layout) + ref_res = tvm.topi.testing.trilinear_resize3d_python(x_data, size, layout) else: - ref_res = topi.testing.upsampling3d_python(x_data, (scale, scale, scale), layout) + ref_res = tvm.topi.testing.upsampling3d_python(x_data, (scale, scale, scale), layout) x = relay.var("x", relay.TensorType(dshape, "float32")) z = relay.image.resize3d(x, size, layout, method, "align_corners") assert "size=" in z.astext() @@ -116,7 +121,7 @@ def verify_crop_and_resize(img_shape, boxes, box_indices, crop_size, image_data = np.random.uniform(size=img_shape).astype("float32") - ref_res = topi.testing.crop_and_resize_python(image_data, + ref_res = tvm.topi.testing.crop_and_resize_python(image_data, boxes, box_indices, crop_size, @@ -272,7 +277,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) # get_valid_count for cuda, opencl doesn't do data rearrangement if target in ['cuda', 'opencl']: - continue + return tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) @@ -321,6 +326,8 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + if target == 'cuda': + return op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) @@ -461,7 +468,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ np_data = np.random.uniform(size=data_shape).astype("float32") np_rois = np.random.uniform(size=rois_shape).astype('float32') * in_size np_rois[:, 0] = np.random.randint(low = 0, high = batch, size = num_roi) - ref_res = topi.testing.roi_align_nchw_python(np_data, np_rois, pooled_size=pooled_size, + ref_res = tvm.topi.testing.roi_align_nchw_python(np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio) for target, ctx in ctx_list(): @@ -493,7 +500,7 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale): np_data = np.random.uniform(size=data_shape).astype("float32") np_rois = np.random.uniform(size=rois_shape).astype('float32') * in_size np_rois[:, 0] = np.random.randint(low = 0, high = batch, size = num_roi).astype('float32') - ref_res = topi.testing.roi_pool_nchw_python(np_data, np_rois, pooled_size=pooled_size, + ref_res = tvm.topi.testing.roi_pool_nchw_python(np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) @@ -588,7 +595,7 @@ def verify_yolo_reorg(shape, stride, out_shape): def test_yolo_reorg(): def verify_yolo_reorg(shape, stride): x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") - ref_res = topi.testing.reorg_python(x_data, stride) + ref_res = tvm.topi.testing.reorg_python(x_data, stride) x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.vision.yolo_reorg(x, stride=stride) @@ -656,7 +663,7 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): data = np.random.uniform(size=data_shape).astype(dtype) offset = np.random.uniform(size=offset_shape).astype(dtype) kernel = np.random.uniform(size=kernel_shape).astype(dtype) - ref_res = topi.testing.deformable_conv2d_nchw_python(data, offset, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), deformable_groups=deformable_groups, groups=groups) + ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(data, offset, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), deformable_groups=deformable_groups, groups=groups) for target, ctx in ctx_list(): for kind in ["graph", "debug"]: @@ -677,7 +684,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode): x_data = np.random.uniform(size=dshape).astype("float32") if layout == "NHWC": x_data = np.transpose(x_data, axes=[0, 3, 1, 2]) - ref_res = topi.testing.depth_to_space_python(x_data, block_size, mode=mode) + ref_res = tvm.topi.testing.depth_to_space_python(x_data, block_size, mode=mode) if layout == "NHWC": x_data = np.transpose(x_data, axes=[0, 2, 3, 1]) ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1]) @@ -709,7 +716,7 @@ def verify_space_to_depth(dshape, block_size, layout): x_data = np.random.uniform(size=dshape).astype("float32") if layout == "NHWC": x_data = np.transpose(x_data, axes=[0, 3, 1, 2]) - ref_res = topi.testing.space_to_depth_python(x_data, block_size) + ref_res = tvm.topi.testing.space_to_depth_python(x_data, block_size) if layout == "NHWC": x_data = np.transpose(x_data, axes=[0, 2, 3, 1]) ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1]) @@ -848,7 +855,7 @@ def verify_affine_grid(num_batch, target_shape): func = relay.Function([data], y) data_np = np.random.uniform(size=data_shape).astype(dtype) - ref_res = topi.testing.affine_grid_python(data_np, target_shape) + ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape) for target, ctx in ctx_list(): for kind in ["graph", "debug"]: @@ -874,7 +881,7 @@ def verify_grid_sample(data_shape, grid_shape): data_np = np.random.uniform(size=data_shape).astype(dtype) grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype) - ref_res = topi.testing.grid_sample_nchw_python(data_np, grid_np, method='bilinear') + ref_res = tvm.topi.testing.grid_sample_nchw_python(data_np, grid_np, method='bilinear') for target, ctx in ctx_list(): for kind in ["graph", "debug"]: @@ -908,4 +915,4 @@ def verify_grid_sample(data_shape, grid_shape): test_dilation2d_infer_type() test_dilation2d_run() test_affine_grid() - test_grid_sample() + test_grid_sample() \ No newline at end of file diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 20327ce6eba6..98702eff3326 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -184,84 +184,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): return [valid_count, out, out_indices] -def rearrange_indices_out_ir(data, output, valid_box_count): - """Low level IR to get rearrange_indices_out. - Parameters - ---------- - data : Buffer - Input data. 2-D Buffer with shape [batch_size, num_anchors]. - - output: Buffer - 2-D Buffer with shape [batch_size, num_anchors]. - - valid_box_count : Buffer - 2-D Buffer with shape [batch_size, 1]. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.tir.ir_builder.create() - - data = ib.buffer_ptr(data) - output = ib.buffer_ptr(output) - valid_box_count = ib.buffer_ptr(valid_box_count) - - nthread_tx = batch_size - nthread_bx = 1 - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = tx - - valid_box_count[tid] = 0 - with ib.for_range(0, num_anchors) as anchor_ind: - output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind] - return ib.get() - - -def rearrange_indices_out(data): - """Rearrange nms output to move all valid entries to top. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - NMS output. 2-D - tensor with shape [batch_size, num_anchors]. - - Returns - ------- - output : tvm.te.Tensor or numpy NDArray - 2-D tensor with shape [batch_size, num_anchors]. - - valid_box_count : tvm.te.Tensor or numpy NDArray - Tensor with shape [batch_size, 1], indicates - the valid number of boxes. - """ - batch_size = data.shape[0] - data_buf = tvm.tir.decl_buffer( - data.shape, data.dtype, "data_buf", data_alignment=8) - out_indices_buf = tvm.tir.decl_buffer( - data.shape, data.dtype, "out_indices_buf", data_alignment=8) - valid_count_buf = tvm.tir.decl_buffer( - (batch_size, 1), "int32", "valid_count_buf", data_alignment=8) - - output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], - [data], - lambda ins, outs: rearrange_indices_out_ir( - ins[0], outs[0], outs[1]), - in_buffers=[data_buf], - out_buffers=[out_indices_buf, valid_count_buf], - name="rearrange_indices_out", - tag="rearrange_indices_out_gpu") - return [output, valid_box_count] - - -def nms_ir(data, sorted_index, valid_count, indices, out, box_indices, +def nms_ir(data, sorted_index, valid_count, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): """Low level IR routing for transform location in multibox_detection operator. @@ -277,9 +200,6 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices, valid_count : Buffer Buffer of number of valid output boxes. - indices : Buffer - Buffer represents the index of box in original data. - out : Buffer Output buffer. @@ -334,7 +254,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) - indices = ib.buffer_ptr(indices) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.allocate( "int32", (1,), name="num_valid_boxes", scope="local") @@ -427,10 +346,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): num_valid_boxes[0] += 1 - # convert box_indices to represent original data - org_idx = box_indices[i * num_anchors + j] - with ib.if_scope(box_indices[i * num_anchors + j] >= 0): - box_indices[i * num_anchors + j] = indices[i * num_anchors + org_idx] return ib.get() @@ -539,21 +454,19 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) - indices_buf = tvm.tir.decl_buffer( - indices.shape, indices.dtype, "indices_buf", data_alignment=8) - out, box_indices = \ te.extern([data.shape, score_shape], - [data, sort_tensor, valid_count, indices], + [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], ins[3], outs[0], outs[1], + ins[0], ins[1], ins[2], outs[0], outs[1], max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index), dtype=[data.dtype, "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], name="nms", tag="nms") + # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: - return rearrange_indices_out(box_indices) + return box_indices - return out + return out \ No newline at end of file From 9f300ac2bb4d420de0c7a623eb1784a5dc5858da Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Sun, 30 Aug 2020 14:54:29 +0800 Subject: [PATCH 10/18] add rearrange_indices_out --- python/tvm/topi/cuda/nms.py | 85 ++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 98702eff3326..dc61826dc2e5 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -184,6 +184,88 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): return [valid_count, out, out_indices] +def rearrange_indices_out_ir(data, output, valid_box_count): + """Low level IR to get rearrange_indices_out. + Parameters + ---------- + data : Buffer + Input data. 2-D Buffer with shape [batch_size, num_anchors]. + + output: Buffer + 2-D Buffer with shape [batch_size, num_anchors]. + + valid_box_count : Buffer + 2-D Buffer with shape [batch_size, 1]. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + output = ib.buffer_ptr(output) + valid_box_count = ib.buffer_ptr(valid_box_count) + + nthread_tx = batch_size + nthread_bx = 1 + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = tx + + neg_one = tvm.tir.const(-1, dtype=output.dtype) + valid_box_count[tid] = 0 + with ib.for_range(0, num_anchors) as anchor_ind: + output[tid * num_anchors + anchor_ind] = neg_one + with ib.for_range(0, num_anchors) as anchor_ind: + with ib.if_scope(data[tid * num_anchors + anchor_ind] >= 0): + output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind] + valid_box_count[tid] = valid_box_count[tid] + 1 + return ib.get() + + +def rearrange_indices_out(data): + """Rearrange nms output to move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + NMS output. 2-D + tensor with shape [batch_size, num_anchors]. + + Returns + ------- + output : tvm.te.Tensor or numpy NDArray + 2-D tensor with shape [batch_size, num_anchors]. + + valid_box_count : tvm.te.Tensor or numpy NDArray + Tensor with shape [batch_size, 1], indicates + the valid number of boxes. + """ + batch_size = data.shape[0] + data_buf = tvm.tir.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) + out_indices_buf = tvm.tir.decl_buffer( + data.shape, data.dtype, "out_indices_buf", data_alignment=8) + valid_count_buf = tvm.tir.decl_buffer( + (batch_size, 1), "int32", "valid_count_buf", data_alignment=8) + + output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], + [data], + lambda ins, outs: rearrange_indices_out_ir( + ins[0], outs[0], outs[1]), + in_buffers=[data_buf], + out_buffers=[out_indices_buf, valid_count_buf], + name="rearrange_indices_out", + tag="rearrange_indices_out_gpu") + return [output, valid_box_count] + + def nms_ir(data, sorted_index, valid_count, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): @@ -465,8 +547,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], name="nms", tag="nms") - # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: - return box_indices + return rearrange_indices_out(box_indices) return out \ No newline at end of file From 572fbc6c57dbce609674831f0cdcfe26de7d24ac Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Sun, 30 Aug 2020 15:12:09 +0800 Subject: [PATCH 11/18] fix sanity check --- python/tvm/topi/cuda/nms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index dc61826dc2e5..9d58a6f0399a 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -218,14 +218,14 @@ def rearrange_indices_out_ir(data, output, valid_box_count): ib.scope_attr(bx, "thread_extent", nthread_bx) tid = tx - neg_one = tvm.tir.const(-1, dtype=output.dtype) + neg_one = tvm.tir.const(-1, dtype=output.dtype) valid_box_count[tid] = 0 with ib.for_range(0, num_anchors) as anchor_ind: output[tid * num_anchors + anchor_ind] = neg_one with ib.for_range(0, num_anchors) as anchor_ind: with ib.if_scope(data[tid * num_anchors + anchor_ind] >= 0): output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind] - valid_box_count[tid] = valid_box_count[tid] + 1 + valid_box_count[tid] = valid_box_count[tid] + 1 return ib.get() @@ -550,4 +550,4 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, if return_indices: return rearrange_indices_out(box_indices) - return out \ No newline at end of file + return out From b05f37dd17b3c9a5473cd1601331ece5c381a6f9 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Sun, 30 Aug 2020 15:14:26 +0800 Subject: [PATCH 12/18] add new line --- tests/python/relay/test_op_level5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index bfbef0db048b..254bab5e1692 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -915,4 +915,4 @@ def verify_grid_sample(data_shape, grid_shape): test_dilation2d_infer_type() test_dilation2d_run() test_affine_grid() - test_grid_sample() \ No newline at end of file + test_grid_sample() From 84b9d737f249b8695eb6bda37158e4a0efcc869c Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 9 Sep 2020 10:56:34 +0800 Subject: [PATCH 13/18] debuging --- python/tvm/relay/backend/compile_engine.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index f60335a4d44b..4f883ac7b870 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -121,6 +121,9 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name with target: strategy = fstrategy(attrs, inputs, out_type, target) + print('strategy.specializations: ', strategy.specializations) + for spec in strategy.specializations: + print(spec.implementations[0].name) analyzer = tvm.arith.Analyzer() ret = [] for spec in strategy.specializations: @@ -179,6 +182,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) The best op implementation and the corresponding output tensors. """ + print('target: ', target) all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) best_plevel_impl = max(all_impls, key=lambda x: x.plevel) From 1f5a0e0767ead8902c1fdce92952244be6e4c58b Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Wed, 9 Sep 2020 10:58:24 +0800 Subject: [PATCH 14/18] sync with remote --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 9b178fdb387b..db65157208ec 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 9b178fdb387bffc708f2448a82e85b4737239aed +Subproject commit db65157208ec8fabb7b548c94596211b9db04190 From 88f7867aea85ce3f3779eff0046ff8ef898ec3c4 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Fri, 11 Sep 2020 10:26:41 +0800 Subject: [PATCH 15/18] add nms test for gpu --- python/tvm/relay/backend/compile_engine.py | 15 +++--- tests/python/relay/test_op_level5.py | 48 +++++++++++++++++++- tests/python/topi/python/test_topi_vision.py | 8 +--- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 987d5ee85089..41be47b78fd0 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -24,7 +24,7 @@ from tvm import te from tvm.runtime import Object from tvm.support import libinfo -from ... import target as _target +from ...target import Target from ... import autotvm from .. import function as _function from .. import ty as _ty @@ -33,9 +33,11 @@ logger = logging.getLogger('compile_engine') autotvm_logger = logging.getLogger('autotvm') + @tvm._ffi.register_object("relay.LoweredOutput") class LoweredOutput(Object): """Lowered output""" + def __init__(self, outputs, implement): self.__init_handle_by_constructor__( _backend._make_LoweredOutput, outputs, implement) @@ -53,6 +55,7 @@ class CCacheKey(Object): target : tvm.Target The target we want to run the function on. """ + def __init__(self, source_func, target): self.__init_handle_by_constructor__( _backend._make_CCacheKey, source_func, target) @@ -67,7 +70,7 @@ class CCacheValue(Object): def _get_cache_key(source_func, target): if isinstance(source_func, _function.Function): if isinstance(target, str): - target = _target.create(target) + target = Target(target) if not target: raise ValueError("Need target when source_func is a Function") return CCacheKey(source_func, target) @@ -125,9 +128,6 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name with target: strategy = fstrategy(attrs, inputs, out_type, target) - print('strategy.specializations: ', strategy.specializations) - for spec in strategy.specializations: - print(spec.implementations[0].name) analyzer = tvm.arith.Analyzer() ret = [] for spec in strategy.specializations: @@ -186,7 +186,6 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) The best op implementation and the corresponding output tensors. """ - print('target: ', target) all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) best_plevel_impl = max(all_impls, key=lambda x: x.plevel) @@ -267,7 +266,8 @@ def lower_call(call, inputs, target): new_fields = [] for field in ret_type.fields: if isinstance(field, _ty.TensorType): - new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) + new_fields.append(_ty.TensorType( + get_shape(field.shape), field.dtype)) else: new_fields.append(field) ret_type = _ty.TupleType(new_fields) @@ -303,6 +303,7 @@ def lower_call(call, inputs, target): class CompileEngine(Object): """CompileEngine to get lowered code. """ + def __init__(self): raise RuntimeError("Cannot construct a CompileEngine") diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 25e9ac0ce5a8..b5821c884bfa 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -333,7 +333,7 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) if target == 'cuda': - return + continue op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) @@ -372,6 +372,51 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, np_indices_result, top_k=2) +@tvm.testing.uses_gpu +def test_non_max_suppression_gpu(): + def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, + ref_indices_res, iou_threshold=0.5, force_suppress=True, + top_k=-1, check_type_only=False): + x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) + x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32")) + x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32")) + x3 = relay.var("x3", relay.ty.TensorType((), "int32")) + z_indices = relay.vision.non_max_suppression(x0, x1, x2, x3, \ + iou_threshold=iou_threshold, force_suppress=force_suppress, \ + top_k=top_k, return_indices=True) + if isinstance(z_indices, relay.expr.TupleWrapper): + z_indices = z_indices.astuple() + zz_indices = run_infer_type(z_indices) + + func_indices = relay.Function([x0, x1, x2, x3], z_indices) + func_indices = run_infer_type(func_indices) + for target, ctx in ctx_list(): + if target != 'cuda': + continue + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) + print('op_indices_res1[0]: \n', op_indices_res1[0]) + print('op_indices_res1[1]: \n', op_indices_res1[1]) + + # data after get_valid_counts + np_data = np.array([[[0, 0.8, 1, 20, 25, 45], + [1, 0.7, 2, 21, 26, 45], + [-1, -1, -1, -1, -1, -1], + [2, 0.9, 35, 61, 52, 79], + [1, 0.5, 100, 60, 70, 110]]]).astype("float32") + np_indices = np.array([[0, 1, -1, 3, 4]]).astype("int32") + np_valid_count = np.array([4]).astype("int32") + np_max_output_size = -1 + num_anchors = 5 + dshape = (1, num_anchors, 6) + np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) + verify_nms(np_data, np_valid_count, np_indices, + np_max_output_size, dshape, np_result, + np_indices_result) + @tvm.testing.uses_gpu def test_multibox_transform_loc(): def test_default_value(): @@ -926,6 +971,7 @@ def verify_grid_sample(data_shape, grid_shape): test_yolo_reorg_infer_shape() test_yolo_reorg() test_non_max_suppression() + test_non_max_suppression_gpu() test_deformable_conv2d() test_depth_to_space() test_space_to_depth() diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 691dcdfaf926..a8aef452090a 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -167,12 +167,8 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - if device == 'llvm': - f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) - else: - f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device) - f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) for device in ['llvm', 'cuda', 'opencl']: From baa80344e703444e8c6085bfab88caf6acb7612d Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Fri, 11 Sep 2020 11:39:57 +0800 Subject: [PATCH 16/18] fix test --- tests/python/relay/test_op_level5.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c2564e4e4577..a4eb00e7c3cb 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -390,13 +390,15 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, func_indices = relay.Function([x0, x1, x2, x3], z_indices) func_indices = run_infer_type(func_indices) - for target, ctx in ctx_list(): + for target, ctx in tvm.testing.enabled_targets(): if target != 'cuda': continue intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) - print('op_indices_res1[0]: \n', op_indices_res1[0]) - print('op_indices_res1[1]: \n', op_indices_res1[1]) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) + tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) + tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) # data after get_valid_counts np_data = np.array([[[0, 0.8, 1, 20, 25, 45], From fb2ccaccfdf85a8115240fa45d2bacf67ebf73bc Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Mon, 14 Sep 2020 20:11:26 +0800 Subject: [PATCH 17/18] sync with upstream --- python/tvm/topi/cuda/nms.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index b070d939895a..4f868ad880da 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -194,7 +194,6 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): return [valid_count, out, out_indices] -<<<<<<< HEAD def rearrange_indices_out_ir(data, output, valid_box_count): """Low level IR to get rearrange_indices_out. Parameters @@ -277,10 +276,6 @@ def rearrange_indices_out(data): return [output, valid_box_count] -def nms_ir(data, sorted_index, valid_count, out, box_indices, - max_output_size, iou_threshold, force_suppress, - top_k, coord_start, id_index, score_index): -======= def nms_ir( data, sorted_index, @@ -295,7 +290,6 @@ def nms_ir( id_index, score_index, ): ->>>>>>> upstream/master """Low level IR routing for transform location in multibox_detection operator. Parameters From 8f69efc09018cac769ac11c890bbc61878ffc546 Mon Sep 17 00:00:00 2001 From: lisiyuan Date: Tue, 15 Sep 2020 08:57:13 +0800 Subject: [PATCH 18/18] fix nms gpu test --- tests/python/relay/test_op_level5.py | 59 +++++++++++++++++++++------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 54d5ee23855f..2dd42b84df26 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -495,9 +495,19 @@ def verify_nms( @tvm.testing.uses_gpu def test_non_max_suppression_gpu(): - def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, - ref_indices_res, iou_threshold=0.5, force_suppress=True, - top_k=-1, check_type_only=False): + def verify_nms( + x0_data, + x1_data, + x2_data, + x3_data, + dshape, + ref_res, + ref_indices_res, + iou_threshold=0.5, + force_suppress=True, + top_k=-1, + check_type_only=False + ): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32")) x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32")) @@ -522,23 +532,44 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res, tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) # data after get_valid_counts - np_data = np.array([[[0, 0.8, 1, 20, 25, 45], - [1, 0.7, 2, 21, 26, 45], - [-1, -1, -1, -1, -1, -1], - [2, 0.9, 35, 61, 52, 79], - [1, 0.5, 100, 60, 70, 110]]]).astype("float32") + np_data = np.array( + [ + [ + [0, 0.8, 1, 20, 25, 45], + [1, 0.7, 2, 21, 26, 45], + [-1, -1, -1, -1, -1, -1], + [2, 0.9, 35, 61, 52, 79], + [1, 0.5, 100, 60, 70, 110] + ] + ] + ).astype("float32") np_indices = np.array([[0, 1, -1, 3, 4]]).astype("int32") np_valid_count = np.array([4]).astype("int32") np_max_output_size = -1 num_anchors = 5 dshape = (1, num_anchors, 6) - np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], - [-1, -1, -1, -1, -1, -1]]]) + np_result = np.array( + [ + [ + [2, 0.9, 35, 61, 52, 79], + [0, 0.8, 1, 20, 25, 45], + [-1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1] + ] + ] + ) np_indices_result = np.array([[3, 0, -1, -1, -1]]) - verify_nms(np_data, np_valid_count, np_indices, - np_max_output_size, dshape, np_result, - np_indices_result) + verify_nms( + np_data, + np_valid_count, + np_indices, + np_max_output_size, + dshape, + np_result, + np_indices_result, + top_k=2, + ) @tvm.testing.uses_gpu def test_multibox_transform_loc():