From b86700fe679a9456f2e5740bdb75b02e2fc0c2a9 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Wed, 22 Jul 2020 14:31:44 +0800
Subject: [PATCH 01/18] fix nms for gpu

---
 tests/python/relay/test_op_level5.py |   8 +-
 topi/python/topi/cuda/nms.py         | 157 ++++++++++++++++++++++-----
 2 files changed, 130 insertions(+), 35 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 3a94fc69e001..aa4a6e064d60 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -270,9 +270,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
-            # get_valid_count for cuda, opencl doesn't do data rearrangement
-            if target in ['cuda', 'opencl']:
-                return
+            # get_valid_count for opencl doesn't do data rearrangement
+            if target in ['opencl']:
+                continue
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
 
@@ -321,8 +321,6 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
             intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
             op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
-            if target == 'cuda':
-                return
             op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5)
             op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 4772080a60eb..00e410707610 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -93,8 +93,6 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
     out_indices = ib.buffer_ptr(out_indices)
-    atomic_add_return = ib.allocate(
-        valid_count.dtype, (1,), name='atomic_add_return', scope='local')
     one_count = tvm.tir.const(1, dtype=valid_count.dtype)
     one = tvm.tir.const(1, dtype=out.dtype)
     score_threshold = tvm.ir.make_node(
@@ -102,35 +100,29 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index)
     score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index)
 
-    max_threads = int(tvm.target.Target.current(
-        allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = batch_size * num_anchors // max_threads + 1
+    nthread_tx = batch_size
+    nthread_bx = 1
     tx = te.thread_axis("threadIdx.x")
     bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-    idxd = tvm.tir.indexdiv
-
-    # initialize valid_count
-    with ib.if_scope(tid < batch_size):
-        valid_count[tid] = 0
-    with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
+    tid = tx
+
+    # each thread process one batch
+    valid_count[tid] = 0
+    with ib.for_range(0, num_anchors) as anchor_ind:
+        with ib.for_range(0, elem_length) as k:
+            out[anchor_ind * elem_length + k] = -one
+        out_indices[anchor_ind + tid*num_anchors] = -one_count
+
+    with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.if_scope(
-                tvm.tir.all(data[tid * elem_length + score_index] > score_threshold,
-                            tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
-            atomic_add_return[0] = atomic_add(tvm.tir.call_intrin("handle", "tir.address_of",
-                                                                       valid_count[i]), one_count)
+                tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold,
+                            tvm.tir.any(id_index < 0, data[anchor_ind * elem_length + id_index] >= 0))):
+            valid_count[tid] = valid_count[tid] + 1
             with ib.for_range(0, elem_length) as k:
-                out[tid * elem_length + k] = data[tid * elem_length + k]
-                out_indices[tid + k] = tid + k
-        with ib.else_scope():
-            with ib.for_range(0, elem_length) as k:
-                out[tid * elem_length + k] = -one
-                out_indices[tid + k] = -one_count
-
+                out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k]
+            out_indices[(valid_count[tid]-1) + tid*num_anchors] = anchor_ind
     return ib.get()
 
 
@@ -184,7 +176,101 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     return [valid_count, out, out_indices]
 
 
-def nms_ir(data, sorted_index, valid_count, out, box_indices,
+def rearrange_indices_out_ir(data, output, valid_box_count):
+    """Low level IR to get rearrange_indices_out.
+    Parameters
+    ----------
+    data : Buffer
+        Input data. 2-D Buffer with shape [batch_size, num_anchors].
+
+    output: Buffer
+        2-D Buffer with shape [batch_size, num_anchors].
+
+    valid_box_count : Buffer
+        2-D Buffer with shape [batch_size, 1].
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    batch_size = data.shape[0]
+    num_anchors = data.shape[1]
+    ib = tvm.tir.ir_builder.create()
+
+    data = ib.buffer_ptr(data)
+    output = ib.buffer_ptr(output)
+    valid_box_count = ib.buffer_ptr(valid_box_count)
+
+    one = tvm.tir.const(1, dtype=output.dtype)
+    zero = tvm.tir.const(0, dtype=output.dtype)
+
+    nthread_tx = batch_size
+    nthread_bx = 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = tx
+    
+    valid_box_count[tid] = 0   
+    with ib.for_range(0, num_anchors) as anchor_ind:
+        output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind]
+    '''
+    valid_box_count[tid] = 0    
+    with ib.for_range(0, num_anchors) as anchor_ind:
+        with ib.if_scope(data[tid * num_anchors + anchor_ind] >= zero):
+            output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind]
+            valid_box_count[tid] = valid_box_count[tid] + 1
+        with ib.if_scope(tvm.tir.any(data[tid * num_anchors + anchor_ind] > num_anchors, 
+                                     data[tid * num_anchors + anchor_ind] < -num_anchors)):
+            output[tid * num_anchors + valid_box_count[tid]] = zero
+            valid_box_count[tid] = valid_box_count[tid] + 1
+        with ib.if_scope(anchor_ind >= valid_box_count[tid]):
+            output[tid * num_anchors + anchor_ind] = -one
+    '''
+    return ib.get()
+
+
+def rearrange_indices_out(data):
+    """Rearrange nms output to move all valid entries to top.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor or numpy NDArray
+        NMS output.  2-D
+        tensor with shape [batch_size, num_anchors].
+
+    Returns
+    -------
+    output : tvm.te.Tensor or numpy NDArray
+        2-D tensor with shape [batch_size, num_anchors].
+
+    valid_box_count : tvm.te.Tensor or numpy NDArray
+        Tensor with shape [batch_size, 1], indicates
+        the valid number of boxes.
+    """
+    batch_size = data.shape[0]
+    num_anchors = data.shape[1]
+    data_buf = tvm.tir.decl_buffer(
+        data.shape, data.dtype, "data_buf", data_alignment=8)
+    out_indices_buf = tvm.tir.decl_buffer(
+        data.shape, data.dtype, "out_indices_buf", data_alignment=8)
+    valid_count_buf = tvm.tir.decl_buffer(
+        (batch_size, 1), "int32", "valid_count_buf", data_alignment=8)
+    
+    output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], 
+                                        [data],
+                                        lambda ins, outs: rearrange_indices_out_ir(
+                                            ins[0], outs[0], outs[1]),
+                                        in_buffers=[data_buf],
+                                        out_buffers=[out_indices_buf, valid_count_buf],
+                                        name="rearrange_indices_out",
+                                        tag="rearrange_indices_out_gpu")
+    return [output, valid_box_count]
+
+
+def nms_ir(data, sorted_index, valid_count, indices, out, box_indices,
            max_output_size, iou_threshold, force_suppress,
            top_k, coord_start, id_index, score_index):
     """Low level IR routing for transform location in multibox_detection operator.
@@ -200,6 +286,9 @@ def nms_ir(data, sorted_index, valid_count, out, box_indices,
     valid_count : Buffer
         Buffer of number of valid output boxes.
 
+    indices : Buffer
+        Buffer represents the index of box in original data.
+
     out : Buffer
         Output buffer.
 
@@ -253,6 +342,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     data = ib.buffer_ptr(data)
     sorted_index = ib.buffer_ptr(sorted_index)
     valid_count = ib.buffer_ptr(valid_count)
+    indices = ib.buffer_ptr(indices)
     out = ib.buffer_ptr(out)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.allocate(
@@ -346,6 +436,10 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                         box_indices[i * num_anchors + j] = -1
                     with ib.else_scope():
                         num_valid_boxes[0] += 1
+        # convert box_indices to represent original data
+        org_idx = box_indices[i * num_anchors + j]
+        with ib.if_scope(box_indices[i * num_anchors + j] >= 0):
+            box_indices[i * num_anchors + j] = indices[i * num_anchors + org_idx]
 
     return ib.get()
 
@@ -454,19 +548,22 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
     data_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "data_buf", data_alignment=8)
 
+    indices_buf = tvm.tir.decl_buffer(
+        indices.shape, indices.dtype, "indices_buf", data_alignment=8)
+
     out, box_indices = \
         te.extern([data.shape, score_shape],
-                  [data, sort_tensor, valid_count],
+                  [data, sort_tensor, valid_count, indices],
                   lambda ins, outs: nms_ir(
-            ins[0], ins[1], ins[2], outs[0], outs[1],
+            ins[0], ins[1], ins[2], ins[3], outs[0], outs[1],
             max_output_size, iou_threshold, force_suppress,
             top_k, coord_start, id_index, score_index),
             dtype=[data.dtype, "int32"],
-            in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
+            in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
             name="nms",
             tag="nms")
     # TODO(yongwww): Update cuda nms to be consistent with cpu version
     if return_indices:
-        return box_indices
+        return rearrange_indices_out(box_indices)
 
     return out

From 84ba8a8cb90083b3ec41170f1aab12c957cacbdd Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Wed, 22 Jul 2020 15:03:26 +0800
Subject: [PATCH 02/18] fix pylint error

---
 topi/python/topi/cuda/nms.py | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 00e410707610..26dcc5b508fe 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -118,7 +118,8 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.if_scope(
                 tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold,
-                            tvm.tir.any(id_index < 0, data[anchor_ind * elem_length + id_index] >= 0))):
+                            tvm.tir.any(id_index < 0, 
+                                        data[anchor_ind * elem_length + id_index] >= 0))):
             valid_count[tid] = valid_count[tid] + 1
             with ib.for_range(0, elem_length) as k:
                 out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k]
@@ -202,9 +203,6 @@ def rearrange_indices_out_ir(data, output, valid_box_count):
     output = ib.buffer_ptr(output)
     valid_box_count = ib.buffer_ptr(valid_box_count)
 
-    one = tvm.tir.const(1, dtype=output.dtype)
-    zero = tvm.tir.const(0, dtype=output.dtype)
-
     nthread_tx = batch_size
     nthread_bx = 1
     tx = te.thread_axis("threadIdx.x")
@@ -213,22 +211,9 @@ def rearrange_indices_out_ir(data, output, valid_box_count):
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = tx
     
-    valid_box_count[tid] = 0   
+    valid_box_count[tid] = 0
     with ib.for_range(0, num_anchors) as anchor_ind:
         output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind]
-    '''
-    valid_box_count[tid] = 0    
-    with ib.for_range(0, num_anchors) as anchor_ind:
-        with ib.if_scope(data[tid * num_anchors + anchor_ind] >= zero):
-            output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind]
-            valid_box_count[tid] = valid_box_count[tid] + 1
-        with ib.if_scope(tvm.tir.any(data[tid * num_anchors + anchor_ind] > num_anchors, 
-                                     data[tid * num_anchors + anchor_ind] < -num_anchors)):
-            output[tid * num_anchors + valid_box_count[tid]] = zero
-            valid_box_count[tid] = valid_box_count[tid] + 1
-        with ib.if_scope(anchor_ind >= valid_box_count[tid]):
-            output[tid * num_anchors + anchor_ind] = -one
-    '''
     return ib.get()
 
 
@@ -251,7 +236,6 @@ def rearrange_indices_out(data):
         the valid number of boxes.
     """
     batch_size = data.shape[0]
-    num_anchors = data.shape[1]
     data_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "data_buf", data_alignment=8)
     out_indices_buf = tvm.tir.decl_buffer(

From e95d2a47291ef4f849548b329540beac0b957e89 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Wed, 22 Jul 2020 15:10:09 +0800
Subject: [PATCH 03/18] fix pylint error

---
 topi/python/topi/cuda/nms.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 26dcc5b508fe..6f7e3c7f2bf2 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -118,7 +118,7 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.if_scope(
                 tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold,
-                            tvm.tir.any(id_index < 0, 
+                            tvm.tir.any(id_index < 0,
                                         data[anchor_ind * elem_length + id_index] >= 0))):
             valid_count[tid] = valid_count[tid] + 1
             with ib.for_range(0, elem_length) as k:
@@ -210,7 +210,7 @@ def rearrange_indices_out_ir(data, output, valid_box_count):
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = tx
-    
+
     valid_box_count[tid] = 0
     with ib.for_range(0, num_anchors) as anchor_ind:
         output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind]
@@ -242,8 +242,8 @@ def rearrange_indices_out(data):
         data.shape, data.dtype, "out_indices_buf", data_alignment=8)
     valid_count_buf = tvm.tir.decl_buffer(
         (batch_size, 1), "int32", "valid_count_buf", data_alignment=8)
-    
-    output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape], 
+
+    output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape],
                                         [data],
                                         lambda ins, outs: rearrange_indices_out_ir(
                                             ins[0], outs[0], outs[1]),

From 4b4e94c6f9a014cf95db304dbee33137d5067fe3 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Wed, 22 Jul 2020 20:11:54 +0800
Subject: [PATCH 04/18] fix get_valid_counts for batch larger than 1

---
 topi/python/topi/cuda/nms.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 6f7e3c7f2bf2..1df1391e4735 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -110,20 +110,24 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
 
     # each thread process one batch
     valid_count[tid] = 0
+    data_base_ind = tid * num_anchors * elem_length
+    ind_base_ind = tid * num_anchors
     with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.for_range(0, elem_length) as k:
-            out[anchor_ind * elem_length + k] = -one
-        out_indices[anchor_ind + tid*num_anchors] = -one_count
+            out[data_base_ind + anchor_ind * elem_length + k] = -one
+        out_indices[ind_base_ind + anchor_ind] = -one_count
 
     with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.if_scope(
-                tvm.tir.all(data[anchor_ind * elem_length + score_index] > score_threshold,
-                            tvm.tir.any(id_index < 0,
-                                        data[anchor_ind * elem_length + id_index] >= 0))):
+                tvm.tir.all(
+                    data[data_base_ind + anchor_ind * elem_length + score_index] > score_threshold,
+                    tvm.tir.any(id_index < 0,
+                                data[data_base_ind + anchor_ind * elem_length + id_index] >= 0))):
             valid_count[tid] = valid_count[tid] + 1
             with ib.for_range(0, elem_length) as k:
-                out[(valid_count[tid]-1) * elem_length + k] = data[anchor_ind * elem_length + k]
-            out_indices[(valid_count[tid]-1) + tid*num_anchors] = anchor_ind
+                out[data_base_ind + (valid_count[tid]-1) * elem_length + k] = \
+                    data[data_base_ind + anchor_ind * elem_length + k]
+            out_indices[ind_base_ind + (valid_count[tid]-1)] = anchor_ind
     return ib.get()
 
 
From f8040723b70a49ba21adcd653b83dad6e96348e9 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Thu, 23 Jul 2020 10:01:11 +0800
Subject: [PATCH 05/18] remove add_atomic

---
 tests/python/relay/test_op_level5.py |  3 ---
 topi/python/topi/cuda/nms.py         | 26 --------------------------
 2 files changed, 29 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index aa4a6e064d60..328a1f27e22d 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -270,9 +270,6 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
-            # get_valid_count for opencl doesn't do data rearrangement
-            if target in ['opencl']:
-                continue
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
 
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 1df1391e4735..c9dbea7c5c99 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -25,32 +25,6 @@
 from .. import tag
 
 
-def cuda_atomic_add_rule(op):
-    if op.dtype == "float32":
-        return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
-    if op.dtype == "float64":
-        return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
-    if op.dtype == "int32":
-        return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
-    raise RuntimeError("only support int32, float32 and float64")
-
-def opencl_atomic_add_rule(op):
-    if op.dtype == "int32":
-        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
-    raise RuntimeError("only support int32")
-
-tvm.target.intrin.register_intrin_rule(
-    "cuda", "atomic_add", cuda_atomic_add_rule, override=True)
-
-tvm.target.intrin.register_intrin_rule(
-    "opencl", "atomic_add", opencl_atomic_add_rule, override=True)
-
-tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque)
-
-def atomic_add(x, y):
-    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
-
-
 def get_valid_counts_ir(data, valid_count, out, out_indices,
                         score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes

From b43f3ec0295bac005ce9e251f8c11178f2ddfe53 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Fri, 24 Jul 2020 10:22:47 +0800
Subject: [PATCH 06/18] wakey wakey GitHub Actions


From 113985b25de3ac311d58489494baa399d23caf1e Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Mon, 27 Jul 2020 09:33:41 +0800
Subject: [PATCH 07/18] remove get_valid_counts single thread

---
 topi/python/topi/cuda/nms.py | 117 ++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 65 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index c9dbea7c5c99..364591da71f4 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -25,32 +25,51 @@
 from .. import tag
 
 
+def cuda_atomic_add_rule(op):
+    if op.dtype == "float32":
+        return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
+    if op.dtype == "float64":
+        return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
+    if op.dtype == "int32":
+        return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
+    raise RuntimeError("only support int32, float32 and float64")
+
+def opencl_atomic_add_rule(op):
+    if op.dtype == "int32":
+        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
+    raise RuntimeError("only support int32")
+
+tvm.target.intrin.register_intrin_rule(
+    "cuda", "atomic_add", cuda_atomic_add_rule, override=True)
+
+tvm.target.intrin.register_intrin_rule(
+    "opencl", "atomic_add", opencl_atomic_add_rule, override=True)
+
+tvm.ir.register_op_attr("tir.atomic_add", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque)
+
+def atomic_add(x, y):
+    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
+
+
 def get_valid_counts_ir(data, valid_count, out, out_indices,
                         score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
     top of input data.
-
     Parameters
     ----------
     data : Buffer
         Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
-
     valid_count : Buffer
         1D buffer for valid number of boxes with shape [batch_size, ].
-
     flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
-
     score_threshold : float32
         Lower limit of score for valid bounding boxes.
-
     id_index : optional, int
         index of the class categories, -1 to disable.
-
     score_index: optional, int
         Index of the scores/confidence of boxes.
-
     Returns
     -------
     stmt : Stmt
@@ -67,6 +86,8 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
     out_indices = ib.buffer_ptr(out_indices)
+    atomic_add_return = ib.allocate(
+        valid_count.dtype, (1,), name='atomic_add_return', scope='local')
     one_count = tvm.tir.const(1, dtype=valid_count.dtype)
     one = tvm.tir.const(1, dtype=out.dtype)
     score_threshold = tvm.ir.make_node(
@@ -74,60 +95,55 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index)
     score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index)
 
-    nthread_tx = batch_size
-    nthread_bx = 1
+    max_threads = int(tvm.target.Target.current(
+        allow_none=False).max_num_threads)
+    nthread_tx = max_threads
+    nthread_bx = batch_size * num_anchors // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
     bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = tx
-
-    # each thread process one batch
-    valid_count[tid] = 0
-    data_base_ind = tid * num_anchors * elem_length
-    ind_base_ind = tid * num_anchors
-    with ib.for_range(0, num_anchors) as anchor_ind:
-        with ib.for_range(0, elem_length) as k:
-            out[data_base_ind + anchor_ind * elem_length + k] = -one
-        out_indices[ind_base_ind + anchor_ind] = -one_count
-
-    with ib.for_range(0, num_anchors) as anchor_ind:
+    tid = bx * max_threads + tx
+    idxd = tvm.tir.indexdiv
+
+    # initialize valid_count
+    with ib.if_scope(tid < batch_size):
+        valid_count[tid] = 0
+    with ib.if_scope(tid < batch_size * num_anchors):
+        i = idxd(tid, num_anchors)
         with ib.if_scope(
-                tvm.tir.all(
-                    data[data_base_ind + anchor_ind * elem_length + score_index] > score_threshold,
-                    tvm.tir.any(id_index < 0,
-                                data[data_base_ind + anchor_ind * elem_length + id_index] >= 0))):
-            valid_count[tid] = valid_count[tid] + 1
+                tvm.tir.all(data[tid * elem_length + score_index] > score_threshold,
+                            tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
+            atomic_add_return[0] = atomic_add(tvm.tir.call_intrin("handle", "tir.address_of",
+                                                                       valid_count[i]), one_count)
             with ib.for_range(0, elem_length) as k:
-                out[data_base_ind + (valid_count[tid]-1) * elem_length + k] = \
-                    data[data_base_ind + anchor_ind * elem_length + k]
-            out_indices[ind_base_ind + (valid_count[tid]-1)] = anchor_ind
+                out[tid * elem_length + k] = data[tid * elem_length + k]
+                out_indices[tid + k] = tid + k
+        with ib.else_scope():
+            with ib.for_range(0, elem_length) as k:
+                out[tid * elem_length + k] = -one
+                out_indices[tid + k] = -one_count
+
     return ib.get()
 
 
 def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     """Get valid count of bounding boxes given a score threshold.
     Also moves valid boxes to the top of input data.
-
     Parameters
     ----------
     data : tvm.te.Tensor
         Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].
-
     score_threshold : optional, float
         Lower limit of score for valid bounding boxes.
-
     id_index : optional, int
         index of the class categories, -1 to disable.
-
     score_index: optional, int
         Index of the scores/confidence of boxes.
-
     Returns
     -------
     valid_count : tvm.te.Tensor
         1-D tensor for valid number of boxes.
-
     out_tensor : tvm.te.Tensor
         Rearranged data tensor.
     """
@@ -236,46 +252,33 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices,
            max_output_size, iou_threshold, force_suppress,
            top_k, coord_start, id_index, score_index):
     """Low level IR routing for transform location in multibox_detection operator.
-
     Parameters
     ----------
     data : Buffer
         Buffer of output boxes with class and score.
-
     sort_index : Buffer
         Buffer of output box indexes sorted by score.
-
     valid_count : Buffer
         Buffer of number of valid output boxes.
-
     indices : Buffer
         Buffer represents the index of box in original data.
-
     out : Buffer
         Output buffer.
-
     max_output_size : int
         Max number of output valid boxes for each instance.
         By default all valid boxes are returned.
-
     iou_threshold : float
         Overlapping(IoU) threshold to suppress object with smaller score.
-
     force_suppress : boolean
         Whether to suppress all detections regardless of class_id.
-
     top_k : int
         Keep maximum top k detections before nms, -1 for no limit.
-
     coord_start : int
         Start index of the consecutive 4 coordinates.
-
     id_index : int
         index of the class categories, -1 to disable.
-
     score_index : optional, int
         Index of the scores/confidence of boxes.
-
     Returns
     -------
     stmt : Stmt
@@ -304,7 +307,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     data = ib.buffer_ptr(data)
     sorted_index = ib.buffer_ptr(sorted_index)
     valid_count = ib.buffer_ptr(valid_count)
-    indices = ib.buffer_ptr(indices)
     out = ib.buffer_ptr(out)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.allocate(
@@ -411,7 +413,6 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
                         coord_start=2, score_index=1, id_index=0,
                         return_indices=True, invalid_to_bottom=False):
     """Non-maximum suppression operator for object detection.
-
     Parameters
     ----------
     data : tvm.te.Tensor
@@ -419,55 +420,41 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
         The last dimension should be in format of
         [class_id, score, box_left, box_top, box_right, box_bottom].
         It could be the second output out_tensor of get_valid_counts.
-
     valid_count : tvm.te.Tensor
         1-D tensor for valid number of boxes. It could be the output
         valid_count of get_valid_counts.
-
     indices : tvm.te.Tensor
         2-D tensor with shape [batch_size, num_anchors], represents
         the index of box in original data. It could be the third
         output out_indices of get_valid_counts. The values in the
         second dimension are like the output of arange(num_anchors)
         if get_valid_counts is not used before non_max_suppression.
-
     max_output_size : optional, int
         Max number of output valid boxes for each instance.
         By default all valid boxes are returned.
-
     iou_threshold : optional, float
         Non-maximum suppression threshold.
-
     force_suppress : optional, boolean
         Whether to suppress all detections regardless of class_id.
-
     top_k : optional, int
         Keep maximum top k detections before nms, -1 for no limit.
-
     coord_start : required, int
         Start index of the consecutive 4 coordinates.
-
     score_index : optional, int
         Index of the scores/confidence of boxes.
-
     id_index : optional, int
         index of the class categories, -1 to disable.
-
     return_indices : boolean
         Whether to return box indices in input data.
-
     invalid_to_bottom : optional, boolean
         Whether to move all valid bounding boxes to the top.
-
     Returns
     -------
     out : tvm.te.Tensor
         3-D tensor with shape [batch_size, num_anchors, elem_length].
-
     Example
     --------
     .. code-block:: python
-
         # An example to use nms
         dshape = (1, 5, 6)
         data = te.placeholder(dshape, name="data")
@@ -528,4 +515,4 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
     if return_indices:
         return rearrange_indices_out(box_indices)
 
-    return out
+    return out
\ No newline at end of file

From bb3b15d0096e3ae7147517512febeb7447969dd3 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nucflow>
Date: Mon, 27 Jul 2020 09:47:24 +0800
Subject: [PATCH 08/18] only rearrangel output

---
 tests/python/relay/test_op_level5.py |  3 ++
 topi/python/topi/cuda/nms.py         | 45 ++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 328a1f27e22d..c465741b7226 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -270,6 +270,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
+            # get_valid_count for cuda, opencl doesn't do data rearrangement
+            if target in ['cuda', 'opencl']:
+                continue
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
 
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 364591da71f4..20327ce6eba6 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -56,20 +56,27 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
     top of input data.
+
     Parameters
     ----------
     data : Buffer
         Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
+
     valid_count : Buffer
         1D buffer for valid number of boxes with shape [batch_size, ].
+
     flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
+
     score_threshold : float32
         Lower limit of score for valid bounding boxes.
+
     id_index : optional, int
         index of the class categories, -1 to disable.
+
     score_index: optional, int
         Index of the scores/confidence of boxes.
+
     Returns
     -------
     stmt : Stmt
@@ -130,20 +137,26 @@ def get_valid_counts_ir(data, valid_count, out, out_indices,
 def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     """Get valid count of bounding boxes given a score threshold.
     Also moves valid boxes to the top of input data.
+
     Parameters
     ----------
     data : tvm.te.Tensor
         Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].
+
     score_threshold : optional, float
         Lower limit of score for valid bounding boxes.
+
     id_index : optional, int
         index of the class categories, -1 to disable.
+
     score_index: optional, int
         Index of the scores/confidence of boxes.
+
     Returns
     -------
     valid_count : tvm.te.Tensor
         1-D tensor for valid number of boxes.
+
     out_tensor : tvm.te.Tensor
         Rearranged data tensor.
     """
@@ -252,33 +265,46 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices,
            max_output_size, iou_threshold, force_suppress,
            top_k, coord_start, id_index, score_index):
     """Low level IR routing for transform location in multibox_detection operator.
+
     Parameters
     ----------
     data : Buffer
         Buffer of output boxes with class and score.
+
     sort_index : Buffer
         Buffer of output box indexes sorted by score.
+
     valid_count : Buffer
         Buffer of number of valid output boxes.
+
     indices : Buffer
         Buffer represents the index of box in original data.
+
     out : Buffer
         Output buffer.
+
     max_output_size : int
         Max number of output valid boxes for each instance.
         By default all valid boxes are returned.
+
     iou_threshold : float
         Overlapping(IoU) threshold to suppress object with smaller score.
+
     force_suppress : boolean
         Whether to suppress all detections regardless of class_id.
+
     top_k : int
         Keep maximum top k detections before nms, -1 for no limit.
+
     coord_start : int
         Start index of the consecutive 4 coordinates.
+
     id_index : int
         index of the class categories, -1 to disable.
+
     score_index : optional, int
         Index of the scores/confidence of boxes.
+
     Returns
     -------
     stmt : Stmt
@@ -308,6 +334,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     sorted_index = ib.buffer_ptr(sorted_index)
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
+    indices = ib.buffer_ptr(indices)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.allocate(
         "int32", (1,), name="num_valid_boxes", scope="local")
@@ -413,6 +440,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
                         coord_start=2, score_index=1, id_index=0,
                         return_indices=True, invalid_to_bottom=False):
     """Non-maximum suppression operator for object detection.
+
     Parameters
     ----------
     data : tvm.te.Tensor
@@ -420,41 +448,55 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
         The last dimension should be in format of
         [class_id, score, box_left, box_top, box_right, box_bottom].
         It could be the second output out_tensor of get_valid_counts.
+
     valid_count : tvm.te.Tensor
         1-D tensor for valid number of boxes. It could be the output
         valid_count of get_valid_counts.
+
     indices : tvm.te.Tensor
         2-D tensor with shape [batch_size, num_anchors], represents
         the index of box in original data. It could be the third
         output out_indices of get_valid_counts. The values in the
         second dimension are like the output of arange(num_anchors)
         if get_valid_counts is not used before non_max_suppression.
+
     max_output_size : optional, int
         Max number of output valid boxes for each instance.
         By default all valid boxes are returned.
+
     iou_threshold : optional, float
         Non-maximum suppression threshold.
+
     force_suppress : optional, boolean
         Whether to suppress all detections regardless of class_id.
+
     top_k : optional, int
         Keep maximum top k detections before nms, -1 for no limit.
+
     coord_start : required, int
         Start index of the consecutive 4 coordinates.
+
     score_index : optional, int
         Index of the scores/confidence of boxes.
+
     id_index : optional, int
         index of the class categories, -1 to disable.
+
     return_indices : boolean
         Whether to return box indices in input data.
+
     invalid_to_bottom : optional, boolean
         Whether to move all valid bounding boxes to the top.
+
     Returns
     -------
     out : tvm.te.Tensor
         3-D tensor with shape [batch_size, num_anchors, elem_length].
+
     Example
     --------
     .. code-block:: python
+
         # An example to use nms
         dshape = (1, 5, 6)
         data = te.placeholder(dshape, name="data")
@@ -511,8 +553,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
             in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
             name="nms",
             tag="nms")
-    # TODO(yongwww): Update cuda nms to be consistent with cpu version
     if return_indices:
         return rearrange_indices_out(box_indices)
 
-    return out
\ No newline at end of file
+    return out

From 796cc75bdd2a24feb70f187d85ed01da6071ff33 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Sat, 29 Aug 2020 10:02:40 +0800
Subject: [PATCH 09/18] fix conflict

---
 tests/python/relay/test_op_level5.py |  51 ++++++++------
 topi/python/topi/cuda/nms.py         | 101 ++-------------------------
 2 files changed, 36 insertions(+), 116 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index c465741b7226..bfbef0db048b 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -23,7 +23,7 @@
 from tvm import relay
 from tvm.relay import transform
 from tvm.relay.testing import ctx_list, run_infer_type
-import topi.testing
+import tvm.topi.testing
 
 
 def test_resize_infer_type():
@@ -41,19 +41,21 @@ def test_resize_infer_type():
     assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
 
 def test_resize():
-    def verify_resize(dshape, scale, method, layout):
+    def verify_resize(dshape, scale, method, layout, coord_trans):
         if layout == "NHWC":
             size = (dshape[1] * scale, dshape[2] * scale)
         else:
             size = (dshape[2] * scale, dshape[3] * scale)
 
         x_data = np.random.uniform(size=dshape).astype("float32")
+
         if method == "bilinear":
-            ref_res = topi.testing.bilinear_resize_python(x_data, size, layout)
+            ref_res = tvm.topi.testing.bilinear_resize_python(x_data, size, layout, coord_trans)
         else:
-            ref_res = topi.testing.upsampling_python(x_data, (scale, scale), layout)
+            ref_res = tvm.topi.testing.upsampling_python(x_data, (scale, scale), layout)
         x = relay.var("x", relay.TensorType(dshape, "float32"))
-        z = relay.image.resize(x, size, layout, method, "align_corners")
+        z = relay.image.resize(x, size, layout, method,
+                              coordinate_transformation_mode=coord_trans)
         assert "size=" in z.astext()
         zz = run_infer_type(z)
         assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
@@ -63,10 +65,13 @@ def verify_resize(dshape, scale, method, layout):
             for kind in ["graph", "debug"]:
                 intrp = relay.create_executor(kind, ctx=ctx, target=target)
                 op_res = intrp.evaluate(func)(x_data)
-                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-6)
-    for method in ["bilinear", "nearest_neighbor"]:
-        for layout in ["NHWC", "NCHW"]:
-            verify_resize((1, 4, 4, 4), 2, method, layout)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-5)
+
+    for layout in ["NHWC", "NCHW"]:
+        verify_resize((1, 4, 4, 4), 2, "bilinear", layout, "align_corners")
+        verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "half_pixel")
+        verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "asymmetric")
+        verify_resize((3, 4, 5, 6), 5, "nearest_neighbor", layout, "asymmetric")
 
 def test_resize3d_infer_type():
     n, c, d, h, w = te.size_var("n"), te.size_var("c"), te.size_var("d"), te.size_var("h"), te.size_var("w")
@@ -91,9 +96,9 @@ def verify_resize(dshape, scale, method, layout):
 
         x_data = np.random.uniform(size=dshape).astype("float32")
         if method == "trilinear":
-            ref_res = topi.testing.trilinear_resize3d_python(x_data, size, layout)
+            ref_res = tvm.topi.testing.trilinear_resize3d_python(x_data, size, layout)
         else:
-            ref_res = topi.testing.upsampling3d_python(x_data, (scale, scale, scale), layout)
+            ref_res = tvm.topi.testing.upsampling3d_python(x_data, (scale, scale, scale), layout)
         x = relay.var("x", relay.TensorType(dshape, "float32"))
         z = relay.image.resize3d(x, size, layout, method, "align_corners")
         assert "size=" in z.astext()
@@ -116,7 +121,7 @@ def verify_crop_and_resize(img_shape, boxes, box_indices, crop_size,
 
         image_data = np.random.uniform(size=img_shape).astype("float32")
 
-        ref_res = topi.testing.crop_and_resize_python(image_data,
+        ref_res = tvm.topi.testing.crop_and_resize_python(image_data,
                                                       boxes,
                                                       box_indices,
                                                       crop_size,
@@ -272,7 +277,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
             # get_valid_count for cuda, opencl doesn't do data rearrangement
             if target in ['cuda', 'opencl']:
-                continue
+                return
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04)
 
@@ -321,6 +326,8 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
             intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
             op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+            if target == 'cuda':
+                return
             op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5)
             op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
@@ -461,7 +468,7 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_
         np_data = np.random.uniform(size=data_shape).astype("float32")
         np_rois = np.random.uniform(size=rois_shape).astype('float32') * in_size
         np_rois[:, 0] = np.random.randint(low = 0, high = batch, size = num_roi)
-        ref_res = topi.testing.roi_align_nchw_python(np_data, np_rois, pooled_size=pooled_size,
+        ref_res = tvm.topi.testing.roi_align_nchw_python(np_data, np_rois, pooled_size=pooled_size,
                                                      spatial_scale=spatial_scale,
                                                      sample_ratio=sample_ratio)
         for target, ctx in ctx_list():
@@ -493,7 +500,7 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale):
         np_data = np.random.uniform(size=data_shape).astype("float32")
         np_rois = np.random.uniform(size=rois_shape).astype('float32') * in_size
         np_rois[:, 0] = np.random.randint(low = 0, high = batch, size = num_roi).astype('float32')
-        ref_res = topi.testing.roi_pool_nchw_python(np_data, np_rois, pooled_size=pooled_size,
+        ref_res = tvm.topi.testing.roi_pool_nchw_python(np_data, np_rois, pooled_size=pooled_size,
                                                      spatial_scale=spatial_scale)
         for target, ctx in ctx_list():
             intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
@@ -588,7 +595,7 @@ def verify_yolo_reorg(shape, stride, out_shape):
 def test_yolo_reorg():
     def verify_yolo_reorg(shape, stride):
         x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
-        ref_res = topi.testing.reorg_python(x_data, stride)
+        ref_res = tvm.topi.testing.reorg_python(x_data, stride)
 
         x = relay.var("x", relay.TensorType(shape, "float32"))
         z = relay.vision.yolo_reorg(x, stride=stride)
@@ -656,7 +663,7 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups):
         data = np.random.uniform(size=data_shape).astype(dtype)
         offset = np.random.uniform(size=offset_shape).astype(dtype)
         kernel = np.random.uniform(size=kernel_shape).astype(dtype)
-        ref_res = topi.testing.deformable_conv2d_nchw_python(data, offset, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), deformable_groups=deformable_groups, groups=groups)
+        ref_res = tvm.topi.testing.deformable_conv2d_nchw_python(data, offset, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), deformable_groups=deformable_groups, groups=groups)
 
         for target, ctx in ctx_list():
             for kind in ["graph", "debug"]:
@@ -677,7 +684,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode):
         x_data = np.random.uniform(size=dshape).astype("float32")
         if layout == "NHWC":
             x_data = np.transpose(x_data, axes=[0, 3, 1, 2])
-        ref_res = topi.testing.depth_to_space_python(x_data, block_size, mode=mode)
+        ref_res = tvm.topi.testing.depth_to_space_python(x_data, block_size, mode=mode)
         if layout == "NHWC":
             x_data = np.transpose(x_data, axes=[0, 2, 3, 1])
             ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1])
@@ -709,7 +716,7 @@ def verify_space_to_depth(dshape, block_size, layout):
         x_data = np.random.uniform(size=dshape).astype("float32")
         if layout == "NHWC":
             x_data = np.transpose(x_data, axes=[0, 3, 1, 2])
-        ref_res = topi.testing.space_to_depth_python(x_data, block_size)
+        ref_res = tvm.topi.testing.space_to_depth_python(x_data, block_size)
         if layout == "NHWC":
             x_data = np.transpose(x_data, axes=[0, 2, 3, 1])
             ref_res = np.transpose(ref_res, axes=[0, 2, 3, 1])
@@ -848,7 +855,7 @@ def verify_affine_grid(num_batch, target_shape):
 
         func = relay.Function([data], y)
         data_np = np.random.uniform(size=data_shape).astype(dtype)
-        ref_res = topi.testing.affine_grid_python(data_np, target_shape)
+        ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape)
 
         for target, ctx in ctx_list():
             for kind in ["graph", "debug"]:
@@ -874,7 +881,7 @@ def verify_grid_sample(data_shape, grid_shape):
 
         data_np = np.random.uniform(size=data_shape).astype(dtype)
         grid_np = np.random.uniform(size=grid_shape, low=-1.5, high=1.5).astype(dtype)
-        ref_res = topi.testing.grid_sample_nchw_python(data_np, grid_np, method='bilinear')
+        ref_res = tvm.topi.testing.grid_sample_nchw_python(data_np, grid_np, method='bilinear')
 
         for target, ctx in ctx_list():
             for kind in ["graph", "debug"]:
@@ -908,4 +915,4 @@ def verify_grid_sample(data_shape, grid_shape):
     test_dilation2d_infer_type()
     test_dilation2d_run()
     test_affine_grid()
-    test_grid_sample()
+    test_grid_sample()
\ No newline at end of file
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 20327ce6eba6..98702eff3326 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -184,84 +184,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     return [valid_count, out, out_indices]
 
 
-def rearrange_indices_out_ir(data, output, valid_box_count):
-    """Low level IR to get rearrange_indices_out.
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 2-D Buffer with shape [batch_size, num_anchors].
-
-    output: Buffer
-        2-D Buffer with shape [batch_size, num_anchors].
-
-    valid_box_count : Buffer
-        2-D Buffer with shape [batch_size, 1].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    output = ib.buffer_ptr(output)
-    valid_box_count = ib.buffer_ptr(valid_box_count)
-
-    nthread_tx = batch_size
-    nthread_bx = 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = tx
-
-    valid_box_count[tid] = 0
-    with ib.for_range(0, num_anchors) as anchor_ind:
-        output[tid * num_anchors + anchor_ind] = data[tid * num_anchors + anchor_ind]
-    return ib.get()
-
-
-def rearrange_indices_out(data):
-    """Rearrange nms output to move all valid entries to top.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        NMS output.  2-D
-        tensor with shape [batch_size, num_anchors].
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        2-D tensor with shape [batch_size, num_anchors].
-
-    valid_box_count : tvm.te.Tensor or numpy NDArray
-        Tensor with shape [batch_size, 1], indicates
-        the valid number of boxes.
-    """
-    batch_size = data.shape[0]
-    data_buf = tvm.tir.decl_buffer(
-        data.shape, data.dtype, "data_buf", data_alignment=8)
-    out_indices_buf = tvm.tir.decl_buffer(
-        data.shape, data.dtype, "out_indices_buf", data_alignment=8)
-    valid_count_buf = tvm.tir.decl_buffer(
-        (batch_size, 1), "int32", "valid_count_buf", data_alignment=8)
-
-    output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape],
-                                        [data],
-                                        lambda ins, outs: rearrange_indices_out_ir(
-                                            ins[0], outs[0], outs[1]),
-                                        in_buffers=[data_buf],
-                                        out_buffers=[out_indices_buf, valid_count_buf],
-                                        name="rearrange_indices_out",
-                                        tag="rearrange_indices_out_gpu")
-    return [output, valid_box_count]
-
-
-def nms_ir(data, sorted_index, valid_count, indices, out, box_indices,
+def nms_ir(data, sorted_index, valid_count, out, box_indices,
            max_output_size, iou_threshold, force_suppress,
            top_k, coord_start, id_index, score_index):
     """Low level IR routing for transform location in multibox_detection operator.
@@ -277,9 +200,6 @@ def nms_ir(data, sorted_index, valid_count, indices, out, box_indices,
     valid_count : Buffer
         Buffer of number of valid output boxes.
 
-    indices : Buffer
-        Buffer represents the index of box in original data.
-
     out : Buffer
         Output buffer.
 
@@ -334,7 +254,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     sorted_index = ib.buffer_ptr(sorted_index)
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
-    indices = ib.buffer_ptr(indices)
     box_indices = ib.buffer_ptr(box_indices)
     num_valid_boxes = ib.allocate(
         "int32", (1,), name="num_valid_boxes", scope="local")
@@ -427,10 +346,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
                         box_indices[i * num_anchors + j] = -1
                     with ib.else_scope():
                         num_valid_boxes[0] += 1
-        # convert box_indices to represent original data
-        org_idx = box_indices[i * num_anchors + j]
-        with ib.if_scope(box_indices[i * num_anchors + j] >= 0):
-            box_indices[i * num_anchors + j] = indices[i * num_anchors + org_idx]
 
     return ib.get()
 
@@ -539,21 +454,19 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
     data_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "data_buf", data_alignment=8)
 
-    indices_buf = tvm.tir.decl_buffer(
-        indices.shape, indices.dtype, "indices_buf", data_alignment=8)
-
     out, box_indices = \
         te.extern([data.shape, score_shape],
-                  [data, sort_tensor, valid_count, indices],
+                  [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
-            ins[0], ins[1], ins[2], ins[3], outs[0], outs[1],
+            ins[0], ins[1], ins[2], outs[0], outs[1],
             max_output_size, iou_threshold, force_suppress,
             top_k, coord_start, id_index, score_index),
             dtype=[data.dtype, "int32"],
-            in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
+            in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
             name="nms",
             tag="nms")
+    # TODO(yongwww): Update cuda nms to be consistent with cpu version
     if return_indices:
-        return rearrange_indices_out(box_indices)
+        return box_indices
 
-    return out
+    return out
\ No newline at end of file

From 9f300ac2bb4d420de0c7a623eb1784a5dc5858da Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Sun, 30 Aug 2020 14:54:29 +0800
Subject: [PATCH 10/18] add rearrange_indices_out

---
 python/tvm/topi/cuda/nms.py | 85 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 98702eff3326..dc61826dc2e5 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -184,6 +184,88 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     return [valid_count, out, out_indices]
 
 
+def rearrange_indices_out_ir(data, output, valid_box_count):
+    """Low level IR to get rearrange_indices_out.
+    Parameters
+    ----------
+    data : Buffer
+        Input data. 2-D Buffer with shape [batch_size, num_anchors].
+
+    output: Buffer
+        2-D Buffer with shape [batch_size, num_anchors].
+
+    valid_box_count : Buffer
+        2-D Buffer with shape [batch_size, 1].
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    batch_size = data.shape[0]
+    num_anchors = data.shape[1]
+    ib = tvm.tir.ir_builder.create()
+
+    data = ib.buffer_ptr(data)
+    output = ib.buffer_ptr(output)
+    valid_box_count = ib.buffer_ptr(valid_box_count)
+
+    nthread_tx = batch_size
+    nthread_bx = 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = tx
+
+    neg_one = tvm.tir.const(-1, dtype=output.dtype)    
+    valid_box_count[tid] = 0
+    with ib.for_range(0, num_anchors) as anchor_ind:
+        output[tid * num_anchors + anchor_ind] = neg_one
+    with ib.for_range(0, num_anchors) as anchor_ind:
+        with ib.if_scope(data[tid * num_anchors + anchor_ind] >= 0):
+            output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind]
+            valid_box_count[tid] = valid_box_count[tid] + 1        
+    return ib.get()
+
+
+def rearrange_indices_out(data):
+    """Rearrange nms output to move all valid entries to top.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor or numpy NDArray
+        NMS output.  2-D
+        tensor with shape [batch_size, num_anchors].
+
+    Returns
+    -------
+    output : tvm.te.Tensor or numpy NDArray
+        2-D tensor with shape [batch_size, num_anchors].
+
+    valid_box_count : tvm.te.Tensor or numpy NDArray
+        Tensor with shape [batch_size, 1], indicates
+        the valid number of boxes.
+    """
+    batch_size = data.shape[0]
+    data_buf = tvm.tir.decl_buffer(
+        data.shape, data.dtype, "data_buf", data_alignment=8)
+    out_indices_buf = tvm.tir.decl_buffer(
+        data.shape, data.dtype, "out_indices_buf", data_alignment=8)
+    valid_count_buf = tvm.tir.decl_buffer(
+        (batch_size, 1), "int32", "valid_count_buf", data_alignment=8)
+
+    output, valid_box_count = te.extern([out_indices_buf.shape, valid_count_buf.shape],
+                                        [data],
+                                        lambda ins, outs: rearrange_indices_out_ir(
+                                            ins[0], outs[0], outs[1]),
+                                        in_buffers=[data_buf],
+                                        out_buffers=[out_indices_buf, valid_count_buf],
+                                        name="rearrange_indices_out",
+                                        tag="rearrange_indices_out_gpu")
+    return [output, valid_box_count]
+
+
 def nms_ir(data, sorted_index, valid_count, out, box_indices,
            max_output_size, iou_threshold, force_suppress,
            top_k, coord_start, id_index, score_index):
@@ -465,8 +547,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
             in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
             name="nms",
             tag="nms")
-    # TODO(yongwww): Update cuda nms to be consistent with cpu version
     if return_indices:
-        return box_indices
+        return rearrange_indices_out(box_indices)
 
     return out
\ No newline at end of file

From 572fbc6c57dbce609674831f0cdcfe26de7d24ac Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Sun, 30 Aug 2020 15:12:09 +0800
Subject: [PATCH 11/18] fix sanity check

---
 python/tvm/topi/cuda/nms.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index dc61826dc2e5..9d58a6f0399a 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -218,14 +218,14 @@ def rearrange_indices_out_ir(data, output, valid_box_count):
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = tx
 
-    neg_one = tvm.tir.const(-1, dtype=output.dtype)    
+    neg_one = tvm.tir.const(-1, dtype=output.dtype)
     valid_box_count[tid] = 0
     with ib.for_range(0, num_anchors) as anchor_ind:
         output[tid * num_anchors + anchor_ind] = neg_one
     with ib.for_range(0, num_anchors) as anchor_ind:
         with ib.if_scope(data[tid * num_anchors + anchor_ind] >= 0):
             output[tid * num_anchors + valid_box_count[tid]] = data[tid * num_anchors + anchor_ind]
-            valid_box_count[tid] = valid_box_count[tid] + 1        
+            valid_box_count[tid] = valid_box_count[tid] + 1
     return ib.get()
 
 
@@ -550,4 +550,4 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
     if return_indices:
         return rearrange_indices_out(box_indices)
 
-    return out
\ No newline at end of file
+    return out

From b05f37dd17b3c9a5473cd1601331ece5c381a6f9 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Sun, 30 Aug 2020 15:14:26 +0800
Subject: [PATCH 12/18] add new line

---
 tests/python/relay/test_op_level5.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index bfbef0db048b..254bab5e1692 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -915,4 +915,4 @@ def verify_grid_sample(data_shape, grid_shape):
     test_dilation2d_infer_type()
     test_dilation2d_run()
     test_affine_grid()
-    test_grid_sample()
\ No newline at end of file
+    test_grid_sample()

From 84b9d737f249b8695eb6bda37158e4a0efcc869c Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Wed, 9 Sep 2020 10:56:34 +0800
Subject: [PATCH 13/18] debuging

---
 python/tvm/relay/backend/compile_engine.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index f60335a4d44b..4f883ac7b870 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -121,6 +121,9 @@ def get_valid_implementations(op, attrs, inputs, out_type, target):
     assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name
     with target:
         strategy = fstrategy(attrs, inputs, out_type, target)
+    print('strategy.specializations: ', strategy.specializations)
+    for spec in strategy.specializations:
+        print(spec.implementations[0].name)
     analyzer = tvm.arith.Analyzer()
     ret = []
     for spec in strategy.specializations:
@@ -179,6 +182,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True)
     ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
         The best op implementation and the corresponding output tensors.
     """
+    print('target: ', target)
     all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
 
     best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

From 1f5a0e0767ead8902c1fdce92952244be6e4c58b Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Wed, 9 Sep 2020 10:58:24 +0800
Subject: [PATCH 14/18] sync with remote

---
 3rdparty/vta-hw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 9b178fdb387b..db65157208ec 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 9b178fdb387bffc708f2448a82e85b4737239aed
+Subproject commit db65157208ec8fabb7b548c94596211b9db04190

From 88f7867aea85ce3f3779eff0046ff8ef898ec3c4 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Fri, 11 Sep 2020 10:26:41 +0800
Subject: [PATCH 15/18] add nms test for gpu

---
 python/tvm/relay/backend/compile_engine.py   | 15 +++---
 tests/python/relay/test_op_level5.py         | 48 +++++++++++++++++++-
 tests/python/topi/python/test_topi_vision.py |  8 +---
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index 987d5ee85089..41be47b78fd0 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -24,7 +24,7 @@
 from tvm import te
 from tvm.runtime import Object
 from tvm.support import libinfo
-from ... import target as _target
+from ...target import Target
 from ... import autotvm
 from .. import function as _function
 from .. import ty as _ty
@@ -33,9 +33,11 @@
 logger = logging.getLogger('compile_engine')
 autotvm_logger = logging.getLogger('autotvm')
 
+
 @tvm._ffi.register_object("relay.LoweredOutput")
 class LoweredOutput(Object):
     """Lowered output"""
+
     def __init__(self, outputs, implement):
         self.__init_handle_by_constructor__(
             _backend._make_LoweredOutput, outputs, implement)
@@ -53,6 +55,7 @@ class CCacheKey(Object):
     target : tvm.Target
         The target we want to run the function on.
     """
+
     def __init__(self, source_func, target):
         self.__init_handle_by_constructor__(
             _backend._make_CCacheKey, source_func, target)
@@ -67,7 +70,7 @@ class CCacheValue(Object):
 def _get_cache_key(source_func, target):
     if isinstance(source_func, _function.Function):
         if isinstance(target, str):
-            target = _target.create(target)
+            target = Target(target)
             if not target:
                 raise ValueError("Need target when source_func is a Function")
         return CCacheKey(source_func, target)
@@ -125,9 +128,6 @@ def get_valid_implementations(op, attrs, inputs, out_type, target):
     assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name
     with target:
         strategy = fstrategy(attrs, inputs, out_type, target)
-    print('strategy.specializations: ', strategy.specializations)
-    for spec in strategy.specializations:
-        print(spec.implementations[0].name)
     analyzer = tvm.arith.Analyzer()
     ret = []
     for spec in strategy.specializations:
@@ -186,7 +186,6 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True)
     ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
         The best op implementation and the corresponding output tensors.
     """
-    print('target: ', target)
     all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
 
     best_plevel_impl = max(all_impls, key=lambda x: x.plevel)
@@ -267,7 +266,8 @@ def lower_call(call, inputs, target):
         new_fields = []
         for field in ret_type.fields:
             if isinstance(field, _ty.TensorType):
-                new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype))
+                new_fields.append(_ty.TensorType(
+                    get_shape(field.shape), field.dtype))
             else:
                 new_fields.append(field)
         ret_type = _ty.TupleType(new_fields)
@@ -303,6 +303,7 @@ def lower_call(call, inputs, target):
 class CompileEngine(Object):
     """CompileEngine to get lowered code.
     """
+
     def __init__(self):
         raise RuntimeError("Cannot construct a CompileEngine")
 
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 25e9ac0ce5a8..b5821c884bfa 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -333,7 +333,7 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
             op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
             if target == 'cuda':
-                return
+                continue
             op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
             tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5)
             op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
@@ -372,6 +372,51 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
                np_indices_result, top_k=2)
 
 
+@tvm.testing.uses_gpu
+def test_non_max_suppression_gpu():
+    def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
+                   ref_indices_res, iou_threshold=0.5, force_suppress=True,
+                   top_k=-1, check_type_only=False):
+        x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
+        x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32"))
+        x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32"))
+        x3 = relay.var("x3", relay.ty.TensorType((), "int32"))
+        z_indices = relay.vision.non_max_suppression(x0, x1, x2, x3, \
+                    iou_threshold=iou_threshold, force_suppress=force_suppress, \
+                    top_k=top_k, return_indices=True)
+        if isinstance(z_indices, relay.expr.TupleWrapper):
+            z_indices = z_indices.astuple()
+        zz_indices = run_infer_type(z_indices)
+        
+        func_indices = relay.Function([x0, x1, x2, x3], z_indices)
+        func_indices = run_infer_type(func_indices)
+        for target, ctx in ctx_list():
+            if target != 'cuda':
+                continue
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
+            print('op_indices_res1[0]: \n', op_indices_res1[0])
+            print('op_indices_res1[1]: \n', op_indices_res1[1])
+
+    # data after get_valid_counts
+    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], 
+                         [1, 0.7, 2, 21, 26, 45],
+                         [-1, -1, -1, -1, -1, -1], 
+                         [2, 0.9, 35, 61, 52, 79],
+                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
+    np_indices = np.array([[0, 1, -1, 3, 4]]).astype("int32")
+    np_valid_count = np.array([4]).astype("int32")
+    np_max_output_size = -1
+    num_anchors = 5
+    dshape = (1, num_anchors, 6)
+    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
+                           [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1],
+                           [-1, -1, -1, -1, -1, -1]]])
+    np_indices_result = np.array([[3, 0, -1, -1, -1]])
+    verify_nms(np_data, np_valid_count, np_indices, 
+               np_max_output_size, dshape, np_result,
+               np_indices_result)
+
 @tvm.testing.uses_gpu
 def test_multibox_transform_loc():
     def test_default_value():
@@ -926,6 +971,7 @@ def verify_grid_sample(data_shape, grid_shape):
     test_yolo_reorg_infer_shape()
     test_yolo_reorg()
     test_non_max_suppression()
+    test_non_max_suppression_gpu()
     test_deformable_conv2d()
     test_depth_to_space()
     test_space_to_depth()
diff --git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py
index 691dcdfaf926..a8aef452090a 100644
--- a/tests/python/topi/python/test_topi_vision.py
+++ b/tests/python/topi/python/test_topi_vision.py
@@ -167,12 +167,8 @@ def check_device(device):
         tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
 
         tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx)
-        if device == 'llvm':
-            f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
-            f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
-        else:
-            f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device)
-            f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
+        f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device)
+        f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
         tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4)
 
     for device in ['llvm', 'cuda', 'opencl']:

From baa80344e703444e8c6085bfab88caf6acb7612d Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Fri, 11 Sep 2020 11:39:57 +0800
Subject: [PATCH 16/18] fix test

---
 tests/python/relay/test_op_level5.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index c2564e4e4577..a4eb00e7c3cb 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -390,13 +390,15 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
         
         func_indices = relay.Function([x0, x1, x2, x3], z_indices)
         func_indices = run_infer_type(func_indices)
-        for target, ctx in ctx_list():
+        for target, ctx in tvm.testing.enabled_targets():
             if target != 'cuda':
                 continue
             intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
             op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
-            print('op_indices_res1[0]: \n', op_indices_res1[0])
-            print('op_indices_res1[1]: \n', op_indices_res1[1])
+            op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
+            tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5)
+            op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
+            tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5)
 
     # data after get_valid_counts
     np_data = np.array([[[0, 0.8, 1, 20, 25, 45], 

From fb2ccaccfdf85a8115240fa45d2bacf67ebf73bc Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Mon, 14 Sep 2020 20:11:26 +0800
Subject: [PATCH 17/18] sync with upstream

---
 python/tvm/topi/cuda/nms.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index b070d939895a..4f868ad880da 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -194,7 +194,6 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     return [valid_count, out, out_indices]
 
 
-<<<<<<< HEAD
 def rearrange_indices_out_ir(data, output, valid_box_count):
     """Low level IR to get rearrange_indices_out.
     Parameters
@@ -277,10 +276,6 @@ def rearrange_indices_out(data):
     return [output, valid_box_count]
 
 
-def nms_ir(data, sorted_index, valid_count, out, box_indices,
-           max_output_size, iou_threshold, force_suppress,
-           top_k, coord_start, id_index, score_index):
-=======
 def nms_ir(
     data,
     sorted_index,
@@ -295,7 +290,6 @@ def nms_ir(
     id_index,
     score_index,
 ):
->>>>>>> upstream/master
     """Low level IR routing for transform location in multibox_detection operator.
 
     Parameters

From 8f69efc09018cac769ac11c890bbc61878ffc546 Mon Sep 17 00:00:00 2001
From: lisiyuan <lisiyuan@nuctech>
Date: Tue, 15 Sep 2020 08:57:13 +0800
Subject: [PATCH 18/18] fix nms gpu test

---
 tests/python/relay/test_op_level5.py | 59 +++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 54d5ee23855f..2dd42b84df26 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -495,9 +495,19 @@ def verify_nms(
 
 @tvm.testing.uses_gpu
 def test_non_max_suppression_gpu():
-    def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
-                   ref_indices_res, iou_threshold=0.5, force_suppress=True,
-                   top_k=-1, check_type_only=False):
+    def verify_nms(
+        x0_data,
+        x1_data,
+        x2_data,
+        x3_data,
+        dshape,
+        ref_res,
+        ref_indices_res,
+        iou_threshold=0.5,
+        force_suppress=True,
+        top_k=-1,
+        check_type_only=False
+    ):
         x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
         x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32"))
         x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32"))
@@ -522,23 +532,44 @@ def verify_nms(x0_data, x1_data, x2_data, x3_data, dshape, ref_res,
             tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5)
 
     # data after get_valid_counts
-    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], 
-                         [1, 0.7, 2, 21, 26, 45],
-                         [-1, -1, -1, -1, -1, -1], 
-                         [2, 0.9, 35, 61, 52, 79],
-                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
+    np_data = np.array(
+        [
+            [
+                [0, 0.8, 1, 20, 25, 45], 
+                [1, 0.7, 2, 21, 26, 45],
+                [-1, -1, -1, -1, -1, -1], 
+                [2, 0.9, 35, 61, 52, 79],
+                [1, 0.5, 100, 60, 70, 110]
+            ]
+        ]
+    ).astype("float32")
     np_indices = np.array([[0, 1, -1, 3, 4]]).astype("int32")
     np_valid_count = np.array([4]).astype("int32")
     np_max_output_size = -1
     num_anchors = 5
     dshape = (1, num_anchors, 6)
-    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
-                           [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1],
-                           [-1, -1, -1, -1, -1, -1]]])
+    np_result = np.array(
+        [
+            [
+                [2, 0.9, 35, 61, 52, 79],
+                [0, 0.8, 1, 20, 25, 45],
+                [-1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1],
+                [-1, -1, -1, -1, -1, -1]
+            ]
+        ]
+    )
     np_indices_result = np.array([[3, 0, -1, -1, -1]])
-    verify_nms(np_data, np_valid_count, np_indices, 
-               np_max_output_size, dshape, np_result,
-               np_indices_result)
+    verify_nms(
+        np_data,
+        np_valid_count,
+        np_indices,
+        np_max_output_size,
+        dshape, 
+        np_result,
+        np_indices_result,
+        top_k=2,
+    )
 
 @tvm.testing.uses_gpu
 def test_multibox_transform_loc():