From f8ef7f725bddb8631f9c6363177886554d0d5a66 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 30 Nov 2020 23:26:53 +0000 Subject: [PATCH 1/8] Add rearrange_indices --- python/tvm/topi/cuda/nms.py | 56 +++++++++++++++++++++++++++++++++- tests/python/relay/test_any.py | 41 ++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index ed6e8f086a0d..48e7ba382464 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -53,6 +53,51 @@ def opencl_atomic_add_rule(op): def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) +def rearrange_indices_out_ir(data, out, valid_box_count): + batch_size = data.shape[0] + num_anchors = data.shape[1] + + ib = tvm.tir.ir_builder.create() + data = ib.buffer_ptr(data) + out = ib.buffer_ptr(out) + valid_box_count = ib.buffer_ptr(valid_box_count) + + one_count = tvm.tir.const(1, dtype="int32") + atomic_add_return = ib.allocate( + valid_box_count.dtype, (batch_size,), name="atomic_add_return", scope="local" + ) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_tx = max_threads + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + len_inner_for = (batch_size * num_anchors) // nthread_tx + 1 + + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + with ib.for_range(0, len_inner_for, name="i") as i: + idx = tx * len_inner_for + i + batch_idx = idxd(idx, num_anchors) + with ib.if_scope(idx < batch_size): + valid_box_count[idx] = 0 + atomic_add_return[idx] = 0 + with ib.if_scope(idx < batch_size * num_anchors): + with ib.if_scope(data[idx] >= 0): + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] + atomic_add_return[batch_idx] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + ) + with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0.0 + atomic_add_return[batch_idx] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + ) + with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): + out[idx] = -1.0 + + return ib.get() + def get_valid_counts_ir( data, valid_count, out, out_indices, score_threshold, id_index, score_index @@ -527,6 +572,15 @@ def non_max_suppression( ) # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: - return box_indices + out_buf = tvm.tir.decl_buffer(out.shape, out.dtype, "out_buf", data_alignment=8) + return te.extern( + [out.shape, valid_count.shape], + [out], + lambda ins, outs: rearrange_indices_out_ir(ins[0], outs[0], outs[1]), + dtype=[out.dtype, valid_count.dtype], + in_buffers=[out_buf], + name="rearrange_indices_out", + tag="rearrange_indices_out", + ) return out diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 546973704fea..4af0808f2542 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -25,6 +25,10 @@ from utils.assert_diagnostic import DiagnosticTesting import tvm.topi.testing +import os +#print(os.getpid()) +#input("dummy breakpoint") + def int32(val): return relay.const(val, "int32") @@ -1369,6 +1373,41 @@ def test_any_where(): any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4) ) +@tvm.testing.uses_gpu +def test_non_max_suppression(): + dshape = 
(te.size_var("n"), 5, 6) + x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 5), "float32")) + x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) + x2 = relay.var("x2", relay.ty.TensorType((1, relay.Any()), "int32")) + x3 = relay.var("x3", relay.ty.TensorType((), "int32")) + z = relay.vision.non_max_suppression( + x0, + x1, + x2, + x3, + iou_threshold=0.6, + force_suppress=True, + top_k=-1, + coord_start=1, + score_index=0, + id_index=-1, + return_indices=True, + invalid_to_bottom=False + ) + z = z.astuple() + func = relay.Function([x0, x1, x2, x3], z) + mod = tvm.IRModule() + mod["main"] = func + mod = tvm.relay.transform.InferType()(mod) + print(mod) + with tvm.transform.PassContext( + opt_level=3, + disabled_pass=["FoldScaleAxis"], + ): + vm_exec = relay.vm.compile(mod, target='cuda', params=dict()) + code, lib = vm_exec.save() + lib.export_library('/tmp/my_lib.so') + if __name__ == "__main__": - pytest.main([__file__]) + test_non_max_suppression() From 61e02e14f0822a8dfcd9cfa241db4b4888a53480 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:38:01 +0000 Subject: [PATCH 2/8] Fix output type --- python/tvm/topi/cuda/nms.py | 23 ++++--- tests/python/relay/test_any.py | 108 +++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 48 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 48e7ba382464..30718aad700e 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -53,6 +53,7 @@ def opencl_atomic_add_rule(op): def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) + def rearrange_indices_out_ir(data, out, valid_box_count): batch_size = data.shape[0] num_anchors = data.shape[1] @@ -86,15 +87,17 @@ def rearrange_indices_out_ir(data, out, valid_box_count): with ib.if_scope(data[idx] >= 0): out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] atomic_add_return[batch_idx] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), + one_count, ) with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0.0 + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0 atomic_add_return[batch_idx] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), + one_count, ) with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): - out[idx] = -1.0 + out[idx] = -1 return ib.get() @@ -570,14 +573,16 @@ def non_max_suppression( name="nms", tag="nms", ) - # TODO(yongwww): Update cuda nms to be consistent with cpu version + if return_indices: - out_buf = tvm.tir.decl_buffer(out.shape, out.dtype, "out_buf", data_alignment=8) + out_buf = tvm.tir.decl_buffer( + box_indices.shape, box_indices.dtype, "out_buf", data_alignment=8 + ) return te.extern( - [out.shape, valid_count.shape], - [out], + [box_indices.shape, (batch_size, 1)], + [box_indices], lambda ins, outs: rearrange_indices_out_ir(ins[0], outs[0], outs[1]), - dtype=[out.dtype, valid_count.dtype], + dtype=[box_indices.dtype, valid_count.dtype], in_buffers=[out_buf], name="rearrange_indices_out", tag="rearrange_indices_out", diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 4af0808f2542..3e53c2af1853 100644 --- 
a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -26,8 +26,9 @@ import tvm.topi.testing import os -#print(os.getpid()) -#input("dummy breakpoint") + +# print(os.getpid()) +# input("dummy breakpoint") def int32(val): @@ -42,27 +43,43 @@ def any_dims(ndim): def check_result( - args, mod, expected, flatten=False, assert_shape=False, only_vm=False, targets=None + args, + mod, + expected, + flatten=False, + assert_shape=False, + only_vm=False, + targets=None, + disable_targets=None, ): + if not isinstance(expected, list): + expected = [expected] for kind in ["debug", "vm"]: targets = targets or tvm.testing.enabled_targets() for tgt, ctx in targets: + if disable_targets and tgt in disable_targets: + continue if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type): continue ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt) result = ex.evaluate()(*args) - result = result.asnumpy() - if assert_shape: - assert result.shape == expected, "Shape mismatch: expect %s but got %s." % ( - str(expected), - str(result.shape), - ) - return + if isinstance(result, tvm.runtime.container.ADT): + result = [r.asnumpy() for r in result] + else: + result = [result.asnumpy()] + + for r, e in zip(result, expected): + if assert_shape: + assert r.shape == e, "Shape mismatch: expect %s but got %s." % ( + str(e), + str(r), + ) + return - if flatten: - result = result.flatten() - expected = expected.flatten() - tvm.testing.assert_allclose(result, expected, atol=2e-6) + if flatten: + r = r.flatten() + e = e.flatten() + tvm.testing.assert_allclose(r, e, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): @@ -1373,41 +1390,54 @@ def test_any_where(): any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4) ) + @tvm.testing.uses_gpu def test_non_max_suppression(): - dshape = (te.size_var("n"), 5, 6) - x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 5), "float32")) + x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) x2 = relay.var("x2", relay.ty.TensorType((1, relay.Any()), "int32")) x3 = relay.var("x3", relay.ty.TensorType((), "int32")) z = relay.vision.non_max_suppression( - x0, - x1, - x2, - x3, - iou_threshold=0.6, - force_suppress=True, - top_k=-1, - coord_start=1, - score_index=0, - id_index=-1, - return_indices=True, - invalid_to_bottom=False - ) + x0, + x1, + x2, + x3, + iou_threshold=0.5, + force_suppress=True, + top_k=2, + return_indices=True, + invalid_to_bottom=False, + ) z = z.astuple() func = relay.Function([x0, x1, x2, x3], z) mod = tvm.IRModule() mod["main"] = func - mod = tvm.relay.transform.InferType()(mod) - print(mod) - with tvm.transform.PassContext( - opt_level=3, - disabled_pass=["FoldScaleAxis"], - ): - vm_exec = relay.vm.compile(mod, target='cuda', params=dict()) - code, lib = vm_exec.save() - lib.export_library('/tmp/my_lib.so') + + np_data = np.array( + [ + [ + [0, 0.8, 1, 20, 25, 45], + [1, 0.7, 30, 60, 50, 80], + [0, 0.4, 4, 21, 19, 40], + [2, 0.9, 35, 61, 52, 79], + [1, 0.5, 100, 60, 70, 110], + ] + ] + ).astype("float32") + np_valid_count = np.array([4]).astype("int32") + np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") + np_max_output_size = -1 + np_indices_result = np.array([[4, 0, -1, -1, -1]]) + np_valid_box_count = np.array([[2]]).astype("int32") + + check_result( + [np_data, np_valid_count, np_indices, np_max_output_size], + mod, + [np_indices_result, 
np_valid_box_count], + only_vm=False, + disable_targets=["nvptx"], + ) if __name__ == "__main__": - test_non_max_suppression() + pytest.main([__file__]) From 7483c746ac2aa72dc63e9c2ef3e5a8b9ebe29cfb Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:41:29 +0000 Subject: [PATCH 3/8] Clean test --- tests/python/relay/test_any.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3e53c2af1853..f290745e415d 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -27,9 +27,6 @@ import os -# print(os.getpid()) -# input("dummy breakpoint") - def int32(val): return relay.const(val, "int32") From ced8c7b14b5faab126ea77744e7b42fb5d2a4a59 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:59:07 +0000 Subject: [PATCH 4/8] Fix pylint --- python/tvm/topi/cuda/nms.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 30718aad700e..53ef9d44ff33 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -55,6 +55,20 @@ def atomic_add(x, y): def rearrange_indices_out_ir(data, out, valid_box_count): + """Hybrid routine to rearrange nms output to + move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + tensor with shape [batch_size, num_anchors]. + + + Returns + ------- + stmt : Stmt + The result IR statement. + """ batch_size = data.shape[0] num_anchors = data.shape[1] From 30c0177d6a4e1744535337543692a161040a255e Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 19:01:20 +0000 Subject: [PATCH 5/8] Fix CPU nms multi-batch --- python/tvm/topi/vision/nms.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py index 76e1808698e5..b076fde9ac6e 100644 --- a/python/tvm/topi/vision/nms.py +++ b/python/tvm/topi/vision/nms.py @@ -52,15 +52,16 @@ def hybrid_rearrange_box_out(data, one, batch_size, num_anchors): """ elem_length = data.shape[2] output = output_tensor((batch_size, num_anchors, elem_length), data.dtype) + valid_indices = allocate((batch_size,), "int32") for i in parallel(batch_size): - valid_idx = 0 + valid_indices[i] = 0 for j in range(num_anchors): if data[i, j, 0] >= 0: for k in range(elem_length): - output[i, valid_idx, k] = data[i, j, k] - valid_idx += 1 - if j >= valid_idx: + output[i, valid_indices[i], k] = data[i, j, k] + valid_indices[i] += 1 + if j >= valid_indices[i]: for k in range(elem_length): output[i, j, k] = -one return output @@ -100,19 +101,20 @@ def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors): """ valid_box_count = output_tensor((batch_size, 1), "int32") output = output_tensor((batch_size, num_anchors), data.dtype) + valid_indices = allocate((batch_size,), "int32") for i in parallel(batch_size): - valid_idx = 0 + valid_indices[i] = 0 for j in range(num_anchors): if data[i, j] >= 0: - output[i, valid_idx] = data[i, j] - valid_idx += 1 + output[i, valid_indices[i]] = data[i, j] + valid_indices[i] += 1 if data[i, j] > num_anchors or data[i, j] < -num_anchors: - output[i, valid_idx] = 0 - valid_idx += 1 - if j >= valid_idx: + output[i, valid_indices[i]] = 0 + valid_indices[i] += 1 + if j >= valid_indices[i]: output[i, j] = -one - valid_box_count[i, 0] = valid_idx + valid_box_count[i, 0] = valid_indices[i] return output, valid_box_count From e163cf88848bf0a6e126b2073062c7b6a6b5e08f Mon Sep 17 
00:00:00 2001 From: Yao Wang Date: Thu, 3 Dec 2020 21:56:52 +0000 Subject: [PATCH 6/8] Diable test --- tests/python/relay/test_any.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index f290745e415d..3be5d51f5ee4 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -1388,7 +1388,8 @@ def test_any_where(): ) -@tvm.testing.uses_gpu +# TODO(kevinthesun): enable gpu test when Thrust is available in ci. +# @tvm.testing.uses_gpu def test_non_max_suppression(): x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) From 496c4b2c381e3cd612ebf88cfba0323dc4e448a8 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 4 Dec 2020 22:50:22 +0000 Subject: [PATCH 7/8] Minor fix --- python/tvm/topi/cuda/nms.py | 26 ++++++++++++++------ tests/python/relay/test_any.py | 2 +- tests/python/relay/test_op_level5.py | 4 +-- tests/python/topi/python/test_topi_vision.py | 2 +- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 53ef9d44ff33..7c7c80d1e5ce 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -86,7 +86,7 @@ def rearrange_indices_out_ir(data, out, valid_box_count): nthread_tx = max_threads tx = te.thread_axis("threadIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = (batch_size * num_anchors) // nthread_tx + 1 + len_inner_for = (batch_size * num_anchors) // nthread_tx + 2 idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod @@ -96,21 +96,21 @@ def rearrange_indices_out_ir(data, out, valid_box_count): batch_idx = idxd(idx, num_anchors) with ib.if_scope(idx < batch_size): valid_box_count[idx] = 0 - atomic_add_return[idx] = 0 with ib.if_scope(idx < batch_size * num_anchors): with ib.if_scope(data[idx] >= 0): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] atomic_add_return[batch_idx] = atomic_add( tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count, ) + out[batch_idx * num_anchors + atomic_add_return[batch_idx]] = data[idx] with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0 atomic_add_return[batch_idx] = atomic_add( tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count, ) - with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): + out[batch_idx * num_anchors + atomic_add_return[batch_idx]] = 0 + + with ib.if_scope(idxm(idx, num_anchors) >= valid_box_count[batch_idx]): out[idx] = -1 return ib.get() @@ -260,6 +260,7 @@ def nms_ir( data, sorted_index, valid_count, + indices, out, box_indices, max_output_size, @@ -269,6 +270,7 @@ def nms_ir( coord_start, id_index, score_index, + return_indices, ): """Low level IR routing for transform location in multibox_detection operator. 
@@ -347,6 +349,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) + indices = ib.buffer_ptr(indices) num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) @@ -441,6 +444,12 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.else_scope(): num_valid_boxes[0] += 1 + with ib.if_scope(return_indices): + with ib.if_scope(j < valid_count[i]): + box_idx = box_indices[i * num_anchors + j] + with ib.if_scope(box_idx >= 0): + box_indices[i * num_anchors + j] = indices[i * num_anchors + box_idx] + return ib.get() @@ -564,14 +573,16 @@ def non_max_suppression( ) data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8) out, box_indices = te.extern( [data.shape, score_shape], - [data, sort_tensor, valid_count], + [data, sort_tensor, valid_count, indices], lambda ins, outs: nms_ir( ins[0], ins[1], ins[2], + ins[3], outs[0], outs[1], max_output_size, @@ -581,9 +592,10 @@ def non_max_suppression( coord_start, id_index, score_index, + return_indices, ), dtype=[data.dtype, "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], name="nms", tag="nms", ) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3be5d51f5ee4..3fd8837a885e 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -76,7 +76,7 @@ def check_result( if flatten: r = r.flatten() e = e.flatten() - tvm.testing.assert_allclose(r, e, atol=2e-6) + tvm.testing.assert_allclose(r, e, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 5a5a12c9efe0..9e9aaf842669 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -393,8 +393,8 @@ def verify_nms( intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if target == "cuda": - return + if target == "nvptx": + continue op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 22c9045fd457..6d6353eebce6 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -202,7 +202,7 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - if device == "llvm": + if device in ["llvm", "cuda"]: f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) else: From 9d5176129cc22a53156d02baa7be7232491390c4 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Sat, 5 Dec 2020 03:58:38 +0000 Subject: [PATCH 8/8] Minor fix --- python/tvm/topi/cuda/nms.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 7c7c80d1e5ce..82625ffac557 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -444,7 +444,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.else_scope(): num_valid_boxes[0] += 1 - with ib.if_scope(return_indices): + if return_indices: with ib.if_scope(j < valid_count[i]): box_idx = box_indices[i * num_anchors + j] with ib.if_scope(box_idx >= 0):
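
For reference, below is a minimal NumPy sketch (not part of the patch series) of the index rearrangement that hybrid_rearrange_indices_out (CPU) and rearrange_indices_out_ir (CUDA) implement in these patches: valid NMS indices are compacted to the front of each row, out-of-range indices consume a slot that is written as 0, the remainder is padded with -1, and the number of consumed slots per batch is reported as valid_box_count. The function name rearrange_indices_ref and the example input are illustrative only.

import numpy as np


def rearrange_indices_ref(data):
    """NumPy reference for the rearrangement kernels added in these patches.

    data : int32 array of shape (batch_size, num_anchors) holding NMS box
    indices, with -1 marking suppressed/invalid entries.
    """
    batch_size, num_anchors = data.shape
    output = np.full((batch_size, num_anchors), -1, dtype=data.dtype)
    valid_box_count = np.zeros((batch_size, 1), dtype="int32")
    for i in range(batch_size):
        k = 0
        for j in range(num_anchors):
            # Keep valid indices in order, compacted to the front of the row.
            if data[i, j] >= 0:
                output[i, k] = data[i, j]
                k += 1
            # Out-of-range indices still take a slot but are zeroed,
            # mirroring the two separate checks in the hybrid script.
            if data[i, j] > num_anchors or data[i, j] < -num_anchors:
                output[i, k] = 0
                k += 1
        valid_box_count[i, 0] = k
    return output, valid_box_count


# Hypothetical example with 5 anchors:
#   rearrange_indices_ref(np.array([[-1, 3, -1, 0, -1]], dtype="int32"))
#   -> (array([[ 3,  0, -1, -1, -1]], dtype=int32), array([[2]], dtype=int32))

The CUDA kernel performs the same compaction but reserves output slots with atomic_add, so the within-row ordering of kept indices depends on thread scheduling rather than on the sequential order shown in this sketch.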