From f8ef7f725bddb8631f9c6363177886554d0d5a66 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 30 Nov 2020 23:26:53 +0000 Subject: [PATCH 1/8] Add rearrange_indices --- python/tvm/topi/cuda/nms.py | 56 +++++++++++++++++++++++++++++++++- tests/python/relay/test_any.py | 41 ++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index ed6e8f086a0d..48e7ba382464 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -53,6 +53,51 @@ def opencl_atomic_add_rule(op): def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) +def rearrange_indices_out_ir(data, out, valid_box_count): + batch_size = data.shape[0] + num_anchors = data.shape[1] + + ib = tvm.tir.ir_builder.create() + data = ib.buffer_ptr(data) + out = ib.buffer_ptr(out) + valid_box_count = ib.buffer_ptr(valid_box_count) + + one_count = tvm.tir.const(1, dtype="int32") + atomic_add_return = ib.allocate( + valid_box_count.dtype, (batch_size,), name="atomic_add_return", scope="local" + ) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_tx = max_threads + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + len_inner_for = (batch_size * num_anchors) // nthread_tx + 1 + + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + with ib.for_range(0, len_inner_for, name="i") as i: + idx = tx * len_inner_for + i + batch_idx = idxd(idx, num_anchors) + with ib.if_scope(idx < batch_size): + valid_box_count[idx] = 0 + atomic_add_return[idx] = 0 + with ib.if_scope(idx < batch_size * num_anchors): + with ib.if_scope(data[idx] >= 0): + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] + atomic_add_return[batch_idx] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + ) + with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0.0 + atomic_add_return[batch_idx] = atomic_add( + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + ) + with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): + out[idx] = -1.0 + + return ib.get() + def get_valid_counts_ir( data, valid_count, out, out_indices, score_threshold, id_index, score_index @@ -527,6 +572,15 @@ def non_max_suppression( ) # TODO(yongwww): Update cuda nms to be consistent with cpu version if return_indices: - return box_indices + out_buf = tvm.tir.decl_buffer(out.shape, out.dtype, "out_buf", data_alignment=8) + return te.extern( + [out.shape, valid_count.shape], + [out], + lambda ins, outs: rearrange_indices_out_ir(ins[0], outs[0], outs[1]), + dtype=[out.dtype, valid_count.dtype], + in_buffers=[out_buf], + name="rearrange_indices_out", + tag="rearrange_indices_out", + ) return out diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 546973704fea..4af0808f2542 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -25,6 +25,10 @@ from utils.assert_diagnostic import DiagnosticTesting import tvm.topi.testing +import os +#print(os.getpid()) +#input("dummy breakpoint") + def int32(val): return relay.const(val, "int32") @@ -1369,6 +1373,41 @@ def test_any_where(): any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4) ) +@tvm.testing.uses_gpu +def test_non_max_suppression(): + dshape = 
(te.size_var("n"), 5, 6) + x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 5), "float32")) + x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) + x2 = relay.var("x2", relay.ty.TensorType((1, relay.Any()), "int32")) + x3 = relay.var("x3", relay.ty.TensorType((), "int32")) + z = relay.vision.non_max_suppression( + x0, + x1, + x2, + x3, + iou_threshold=0.6, + force_suppress=True, + top_k=-1, + coord_start=1, + score_index=0, + id_index=-1, + return_indices=True, + invalid_to_bottom=False + ) + z = z.astuple() + func = relay.Function([x0, x1, x2, x3], z) + mod = tvm.IRModule() + mod["main"] = func + mod = tvm.relay.transform.InferType()(mod) + print(mod) + with tvm.transform.PassContext( + opt_level=3, + disabled_pass=["FoldScaleAxis"], + ): + vm_exec = relay.vm.compile(mod, target='cuda', params=dict()) + code, lib = vm_exec.save() + lib.export_library('/tmp/my_lib.so') + if __name__ == "__main__": - pytest.main([__file__]) + test_non_max_suppression() From 61e02e14f0822a8dfcd9cfa241db4b4888a53480 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:38:01 +0000 Subject: [PATCH 2/8] Fix output type --- python/tvm/topi/cuda/nms.py | 23 ++++--- tests/python/relay/test_any.py | 108 +++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 48 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 48e7ba382464..30718aad700e 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -53,6 +53,7 @@ def opencl_atomic_add_rule(op): def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) + def rearrange_indices_out_ir(data, out, valid_box_count): batch_size = data.shape[0] num_anchors = data.shape[1] @@ -86,15 +87,17 @@ def rearrange_indices_out_ir(data, out, valid_box_count): with ib.if_scope(data[idx] >= 0): out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] atomic_add_return[batch_idx] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), + one_count, ) with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0.0 + out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0 atomic_add_return[batch_idx] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count + tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), + one_count, ) with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): - out[idx] = -1.0 + out[idx] = -1 return ib.get() @@ -570,14 +573,16 @@ def non_max_suppression( name="nms", tag="nms", ) - # TODO(yongwww): Update cuda nms to be consistent with cpu version + if return_indices: - out_buf = tvm.tir.decl_buffer(out.shape, out.dtype, "out_buf", data_alignment=8) + out_buf = tvm.tir.decl_buffer( + box_indices.shape, box_indices.dtype, "out_buf", data_alignment=8 + ) return te.extern( - [out.shape, valid_count.shape], - [out], + [box_indices.shape, (batch_size, 1)], + [box_indices], lambda ins, outs: rearrange_indices_out_ir(ins[0], outs[0], outs[1]), - dtype=[out.dtype, valid_count.dtype], + dtype=[box_indices.dtype, valid_count.dtype], in_buffers=[out_buf], name="rearrange_indices_out", tag="rearrange_indices_out", diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 4af0808f2542..3e53c2af1853 100644 --- 
a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -26,8 +26,9 @@ import tvm.topi.testing import os -#print(os.getpid()) -#input("dummy breakpoint") + +# print(os.getpid()) +# input("dummy breakpoint") def int32(val): @@ -42,27 +43,43 @@ def any_dims(ndim): def check_result( - args, mod, expected, flatten=False, assert_shape=False, only_vm=False, targets=None + args, + mod, + expected, + flatten=False, + assert_shape=False, + only_vm=False, + targets=None, + disable_targets=None, ): + if not isinstance(expected, list): + expected = [expected] for kind in ["debug", "vm"]: targets = targets or tvm.testing.enabled_targets() for tgt, ctx in targets: + if disable_targets and tgt in disable_targets: + continue if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type): continue ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt) result = ex.evaluate()(*args) - result = result.asnumpy() - if assert_shape: - assert result.shape == expected, "Shape mismatch: expect %s but got %s." % ( - str(expected), - str(result.shape), - ) - return + if isinstance(result, tvm.runtime.container.ADT): + result = [r.asnumpy() for r in result] + else: + result = [result.asnumpy()] + + for r, e in zip(result, expected): + if assert_shape: + assert r.shape == e, "Shape mismatch: expect %s but got %s." % ( + str(e), + str(r), + ) + return - if flatten: - result = result.flatten() - expected = expected.flatten() - tvm.testing.assert_allclose(result, expected, atol=2e-6) + if flatten: + r = r.flatten() + e = e.flatten() + tvm.testing.assert_allclose(r, e, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): @@ -1373,41 +1390,54 @@ def test_any_where(): any_dims(2), any_dims(2), any_dims(2), (3, 4), (3, 1), (1, 4), y_np_shape_invalid=(2, 4) ) + @tvm.testing.uses_gpu def test_non_max_suppression(): - dshape = (te.size_var("n"), 5, 6) - x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 5), "float32")) + x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) x2 = relay.var("x2", relay.ty.TensorType((1, relay.Any()), "int32")) x3 = relay.var("x3", relay.ty.TensorType((), "int32")) z = relay.vision.non_max_suppression( - x0, - x1, - x2, - x3, - iou_threshold=0.6, - force_suppress=True, - top_k=-1, - coord_start=1, - score_index=0, - id_index=-1, - return_indices=True, - invalid_to_bottom=False - ) + x0, + x1, + x2, + x3, + iou_threshold=0.5, + force_suppress=True, + top_k=2, + return_indices=True, + invalid_to_bottom=False, + ) z = z.astuple() func = relay.Function([x0, x1, x2, x3], z) mod = tvm.IRModule() mod["main"] = func - mod = tvm.relay.transform.InferType()(mod) - print(mod) - with tvm.transform.PassContext( - opt_level=3, - disabled_pass=["FoldScaleAxis"], - ): - vm_exec = relay.vm.compile(mod, target='cuda', params=dict()) - code, lib = vm_exec.save() - lib.export_library('/tmp/my_lib.so') + + np_data = np.array( + [ + [ + [0, 0.8, 1, 20, 25, 45], + [1, 0.7, 30, 60, 50, 80], + [0, 0.4, 4, 21, 19, 40], + [2, 0.9, 35, 61, 52, 79], + [1, 0.5, 100, 60, 70, 110], + ] + ] + ).astype("float32") + np_valid_count = np.array([4]).astype("int32") + np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") + np_max_output_size = -1 + np_indices_result = np.array([[4, 0, -1, -1, -1]]) + np_valid_box_count = np.array([[2]]).astype("int32") + + check_result( + [np_data, np_valid_count, np_indices, np_max_output_size], + mod, + [np_indices_result, 
np_valid_box_count], + only_vm=False, + disable_targets=["nvptx"], + ) if __name__ == "__main__": - test_non_max_suppression() + pytest.main([__file__]) From 7483c746ac2aa72dc63e9c2ef3e5a8b9ebe29cfb Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:41:29 +0000 Subject: [PATCH 3/8] Clean test --- tests/python/relay/test_any.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3e53c2af1853..f290745e415d 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -27,9 +27,6 @@ import os -# print(os.getpid()) -# input("dummy breakpoint") - def int32(val): return relay.const(val, "int32") From ced8c7b14b5faab126ea77744e7b42fb5d2a4a59 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 01:59:07 +0000 Subject: [PATCH 4/8] Fix pylint --- python/tvm/topi/cuda/nms.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 30718aad700e..53ef9d44ff33 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -55,6 +55,20 @@ def atomic_add(x, y): def rearrange_indices_out_ir(data, out, valid_box_count): + """Hybrid routine to rearrange nms output to + move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + tensor with shape [batch_size, num_anchors]. + + + Returns + ------- + stmt : Stmt + The result IR statement. + """ batch_size = data.shape[0] num_anchors = data.shape[1] From 30c0177d6a4e1744535337543692a161040a255e Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 1 Dec 2020 19:01:20 +0000 Subject: [PATCH 5/8] Fix CPU nms multi-batch --- python/tvm/topi/vision/nms.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py index 76e1808698e5..b076fde9ac6e 100644 --- a/python/tvm/topi/vision/nms.py +++ b/python/tvm/topi/vision/nms.py @@ -52,15 +52,16 @@ def hybrid_rearrange_box_out(data, one, batch_size, num_anchors): """ elem_length = data.shape[2] output = output_tensor((batch_size, num_anchors, elem_length), data.dtype) + valid_indices = allocate((batch_size,), "int32") for i in parallel(batch_size): - valid_idx = 0 + valid_indices[i] = 0 for j in range(num_anchors): if data[i, j, 0] >= 0: for k in range(elem_length): - output[i, valid_idx, k] = data[i, j, k] - valid_idx += 1 - if j >= valid_idx: + output[i, valid_indices[i], k] = data[i, j, k] + valid_indices[i] += 1 + if j >= valid_indices[i]: for k in range(elem_length): output[i, j, k] = -one return output @@ -100,19 +101,20 @@ def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors): """ valid_box_count = output_tensor((batch_size, 1), "int32") output = output_tensor((batch_size, num_anchors), data.dtype) + valid_indices = allocate((batch_size,), "int32") for i in parallel(batch_size): - valid_idx = 0 + valid_indices[i] = 0 for j in range(num_anchors): if data[i, j] >= 0: - output[i, valid_idx] = data[i, j] - valid_idx += 1 + output[i, valid_indices[i]] = data[i, j] + valid_indices[i] += 1 if data[i, j] > num_anchors or data[i, j] < -num_anchors: - output[i, valid_idx] = 0 - valid_idx += 1 - if j >= valid_idx: + output[i, valid_indices[i]] = 0 + valid_indices[i] += 1 + if j >= valid_indices[i]: output[i, j] = -one - valid_box_count[i, 0] = valid_idx + valid_box_count[i, 0] = valid_indices[i] return output, valid_box_count From e163cf88848bf0a6e126b2073062c7b6a6b5e08f Mon Sep 17 
00:00:00 2001 From: Yao Wang Date: Thu, 3 Dec 2020 21:56:52 +0000 Subject: [PATCH 6/8] Diable test --- tests/python/relay/test_any.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index f290745e415d..3be5d51f5ee4 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -1388,7 +1388,8 @@ def test_any_where(): ) -@tvm.testing.uses_gpu +# TODO(kevinthesun): enable gpu test when Thrust is available in ci. +# @tvm.testing.uses_gpu def test_non_max_suppression(): x0 = relay.var("x0", relay.ty.TensorType((1, relay.Any(), 6), "float32")) x1 = relay.var("x1", relay.ty.TensorType((1,), "int32")) From 496c4b2c381e3cd612ebf88cfba0323dc4e448a8 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 4 Dec 2020 22:50:22 +0000 Subject: [PATCH 7/8] Minor fix --- python/tvm/topi/cuda/nms.py | 26 ++++++++++++++------ tests/python/relay/test_any.py | 2 +- tests/python/relay/test_op_level5.py | 4 +-- tests/python/topi/python/test_topi_vision.py | 2 +- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 53ef9d44ff33..7c7c80d1e5ce 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -86,7 +86,7 @@ def rearrange_indices_out_ir(data, out, valid_box_count): nthread_tx = max_threads tx = te.thread_axis("threadIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = (batch_size * num_anchors) // nthread_tx + 1 + len_inner_for = (batch_size * num_anchors) // nthread_tx + 2 idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod @@ -96,21 +96,21 @@ def rearrange_indices_out_ir(data, out, valid_box_count): batch_idx = idxd(idx, num_anchors) with ib.if_scope(idx < batch_size): valid_box_count[idx] = 0 - atomic_add_return[idx] = 0 with ib.if_scope(idx < batch_size * num_anchors): with ib.if_scope(data[idx] >= 0): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = data[idx] atomic_add_return[batch_idx] = atomic_add( tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count, ) + out[batch_idx * num_anchors + atomic_add_return[batch_idx]] = data[idx] with ib.if_scope(tvm.tir.any(data[idx] > num_anchors, data[idx] < -num_anchors)): - out[batch_idx * num_anchors + valid_box_count[batch_idx]] = 0 atomic_add_return[batch_idx] = atomic_add( tvm.tir.call_intrin("handle", "tir.address_of", valid_box_count[batch_idx]), one_count, ) - with ib.if_scope(idxm(idx, num_anchors) >= atomic_add_return[batch_idx]): + out[batch_idx * num_anchors + atomic_add_return[batch_idx]] = 0 + + with ib.if_scope(idxm(idx, num_anchors) >= valid_box_count[batch_idx]): out[idx] = -1 return ib.get() @@ -260,6 +260,7 @@ def nms_ir( data, sorted_index, valid_count, + indices, out, box_indices, max_output_size, @@ -269,6 +270,7 @@ def nms_ir( coord_start, id_index, score_index, + return_indices, ): """Low level IR routing for transform location in multibox_detection operator. 
@@ -347,6 +349,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) + indices = ib.buffer_ptr(indices) num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) @@ -441,6 +444,12 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.else_scope(): num_valid_boxes[0] += 1 + with ib.if_scope(return_indices): + with ib.if_scope(j < valid_count[i]): + box_idx = box_indices[i * num_anchors + j] + with ib.if_scope(box_idx >= 0): + box_indices[i * num_anchors + j] = indices[i * num_anchors + box_idx] + return ib.get() @@ -564,14 +573,16 @@ def non_max_suppression( ) data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8) out, box_indices = te.extern( [data.shape, score_shape], - [data, sort_tensor, valid_count], + [data, sort_tensor, valid_count, indices], lambda ins, outs: nms_ir( ins[0], ins[1], ins[2], + ins[3], outs[0], outs[1], max_output_size, @@ -581,9 +592,10 @@ def non_max_suppression( coord_start, id_index, score_index, + return_indices, ), dtype=[data.dtype, "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], name="nms", tag="nms", ) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3be5d51f5ee4..3fd8837a885e 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -76,7 +76,7 @@ def check_result( if flatten: r = r.flatten() e = e.flatten() - tvm.testing.assert_allclose(r, e, atol=2e-6) + tvm.testing.assert_allclose(r, e, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 5a5a12c9efe0..9e9aaf842669 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -393,8 +393,8 @@ def verify_nms( intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if target == "cuda": - return + if target == "nvptx": + continue op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data) diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 22c9045fd457..6d6353eebce6 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -202,7 +202,7 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - if device == "llvm": + if device in ["llvm", "cuda"]: f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) else: From 9d5176129cc22a53156d02baa7be7232491390c4 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Sat, 5 Dec 2020 03:58:38 +0000 Subject: [PATCH 8/8] Minor fix --- python/tvm/topi/cuda/nms.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 7c7c80d1e5ce..82625ffac557 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -444,7 +444,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.else_scope(): num_valid_boxes[0] += 1 - with ib.if_scope(return_indices): + if return_indices: with ib.if_scope(j < valid_count[i]): box_idx = box_indices[i * num_anchors + j] with ib.if_scope(box_idx >= 0):
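
For reference, below is a minimal NumPy sketch (not part of the patch series) of the index rearrangement that hybrid_rearrange_indices_out (CPU) and rearrange_indices_out_ir (CUDA) implement in these patches: valid NMS indices are compacted to the front of each row, out-of-range indices consume a slot that is written as 0, the remainder is padded with -1, and the number of consumed slots per batch is reported as valid_box_count. The function name rearrange_indices_ref and the example input are illustrative only.

import numpy as np


def rearrange_indices_ref(data):
    """NumPy reference for the rearrangement kernels added in these patches.

    data : int32 array of shape (batch_size, num_anchors) holding NMS box
    indices, with -1 marking suppressed/invalid entries.
    """
    batch_size, num_anchors = data.shape
    output = np.full((batch_size, num_anchors), -1, dtype=data.dtype)
    valid_box_count = np.zeros((batch_size, 1), dtype="int32")
    for i in range(batch_size):
        k = 0
        for j in range(num_anchors):
            # Keep valid indices in order, compacted to the front of the row.
            if data[i, j] >= 0:
                output[i, k] = data[i, j]
                k += 1
            # Out-of-range indices still take a slot but are zeroed,
            # mirroring the two separate checks in the hybrid script.
            if data[i, j] > num_anchors or data[i, j] < -num_anchors:
                output[i, k] = 0
                k += 1
        valid_box_count[i, 0] = k
    return output, valid_box_count


# Hypothetical example with 5 anchors:
#   rearrange_indices_ref(np.array([[-1, 3, -1, 0, -1]], dtype="int32"))
#   -> (array([[ 3,  0, -1, -1, -1]], dtype=int32), array([[2]], dtype=int32))

The CUDA kernel performs the same compaction but reserves output slots with atomic_add, so the within-row ordering of kept indices depends on thread scheduling rather than on the sequential order shown in this sketch.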