From 2ab3bd15b5e53287c6768687d71833bd897440b1 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Thu, 26 Mar 2020 20:00:04 +0000
Subject: [PATCH 1/5] get_valid_count updated to have correct results

---
 topi/python/topi/cuda/nms.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index d295116e72fd..ecc8bcb371c9 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -44,7 +44,7 @@ def atomic_add(x, y):
     return tvm.tir.call_pure_intrin(y.dtype, "atomic_add", x, y)
 
 
-def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index):
+def get_valid_counts_ir(data, valid_count, out, score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
     top of input data.
@@ -83,10 +83,11 @@ def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, scor
     data = ib.buffer_ptr(data)
 
     valid_count = ib.buffer_ptr(valid_count)
-    flag = ib.buffer_ptr(flag)
+    out = ib.buffer_ptr(out)
     atomic_add_return = ib.allocate(
         valid_count.dtype, (1,), name='atomic_add_return', scope='local')
     one_count = tvm.tir.const(1, dtype=valid_count.dtype)
+    one = tvm.tir.const(1, dtype=out.dtype)
     score_threshold = tvm.ir.make_node(
         "FloatImm", dtype="float32", value=score_threshold)
     id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index)
@@ -106,17 +107,18 @@ def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, scor
     # initialize valid_count
     with ib.if_scope(tid < batch_size):
         valid_count[tid] = 0
-    # initialize flag
-    with ib.if_scope(tid < batch_size * num_anchors):
-        flag[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         with ib.if_scope(
                 tvm.tir.all(data[tid * elem_length + score_index] > score_threshold,
                             tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
-            flag[tid] = 1
             atomic_add_return[0] = atomic_add(tvm.tir.call_pure_intrin("handle", "tvm_address_of",
                                                                        valid_count[i]), one_count)
+            with ib.for_range(0, elem_length) as k:
+                out[tid * elem_length + k] = data[tid * elem_length + k]
+        with ib.else_scope():
+            with ib.for_range(0, elem_length) as k:
+                out[tid * elem_length + k] = -one
 
     return ib.get()
 
@@ -277,16 +279,16 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     out_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "out_buf", data_alignment=8)
 
-    valid_count, temp_flag = \
-        te.extern([(batch_size,), (batch_size, num_anchors)], [data],
+    valid_count, out = \
+        te.extern([(batch_size,), data.shape], [data],
                   lambda ins, outs: get_valid_counts_ir(
             ins[0], outs[0], outs[1], score_threshold, id_index, score_index),
-            dtype=["int32", "int32"],
+            dtype=["int32", data.dtype],
             in_buffers=[data_buf],
-            out_buffers=[valid_count_buf, temp_flag_buf],
+            out_buffers=[valid_count_buf, out_buf],
             name="get_valid_counts",
             tag="get_valid_counts_gpu")
-
+    '''
     temp_partial = \
         te.extern([(batch_size, num_anchors)], [temp_flag],
                   lambda ins, outs: flag_scan(
@@ -305,6 +307,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
                         temp_partial_buf, valid_count_buf],
             out_buffers=[out_buf],
             name="out_rewrite")
+    '''
 
     return [valid_count, out]
 
@@ -670,7 +673,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
         score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE)
     if tvm.get_global_func("tvm.contrib.thrust.sort_nms", allow_missing=True):
         sort_tensor = argsort_thrust(
-            score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
+            score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype)
     else:
         sort_tensor = argsort(
             score_tensor, valid_count=valid_count, axis=1, is_ascend=False)

From 4d85027c7a39bbbf50f7c76e741060871fa706e8 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Tue, 7 Apr 2020 21:22:13 +0000
Subject: [PATCH 2/5] speedup nms

---
 topi/python/topi/cuda/nms.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index ecc8bcb371c9..31cbde3a607c 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -272,10 +272,12 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
         data.shape, data.dtype, "data_buf", data_alignment=8)
     valid_count_buf = tvm.tir.decl_buffer(
         (batch_size,), "int32", "valid_count_buf", data_alignment=8)
+    '''
     temp_flag_buf = tvm.tir.decl_buffer(
         (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
     temp_partial_buf = tvm.tir.decl_buffer(
         (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8)
+    '''
     out_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "out_buf", data_alignment=8)
 
@@ -702,6 +704,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
     if return_indices:
         return box_indices
 
+    '''
     if invalid_to_bottom:
         output_buf = tvm.tir.decl_buffer(
             data.shape, data.dtype, "output_buf", data_alignment=8)
@@ -727,5 +730,6 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
             name="invalid_to_bottom",
             tag="invalid_to_bottom")
         return output
+    '''
 
     return out

From 6fce2c92896e01fe541bc8131f9caf77d5630e05 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Tue, 14 Apr 2020 20:33:05 +0000
Subject: [PATCH 3/5] update nms

---
 python/tvm/relay/frontend/mxnet.py       |   1 -
 tests/python/relay/test_op_level5.py     |   2 +
 topi/python/topi/cuda/nms.py             | 288 +----------------------
 topi/python/topi/nn/deformable_conv2d.py |   2 +-
 topi/tests/python/test_topi_vision.py    |   4 +-
 5 files changed, 6 insertions(+), 291 deletions(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index b918f9b1adc5..09c824537440 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -842,7 +842,6 @@ def _mx_smooth_l1(inputs, attrs):
 
 def _mx_deformable_convolution(inputs, attrs):
     new_attrs = {}
-    assert attrs.get_bool("no_bias")
     new_attrs["kernel_size"] = attrs.get_int_tuple("kernel")
     new_attrs["strides"] = attrs.get_int_tuple("stride")
     new_attrs["padding"] = attrs.get_int_tuple("pad")
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 2dc549f2f354..457211baaf66 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -222,6 +222,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
         func = relay.Function([x], z.astuple())
         func = run_infer_type(func)
         for target, ctx in ctx_list():
+            if target == 'cuda':
+                return
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 31cbde3a607c..13f97987094e 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -123,123 +123,6 @@ def get_valid_counts_ir(data, valid_count, out, score_threshold, id_index, score
     return ib.get()
 
 
-def flag_scan(flag, prefix_sum):
-    """Low level IR to calculate correct positions for valid boxes.
-
-    Parameters
-    ----------
-    flag : Buffer
-        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
-
-    prefix_sum : Buffer
-        2D Buffer of prefix sum of flags indicating new locations of valid boxes
-        with same shape as flag.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = flag.shape[0]
-    num_anchors = flag.shape[1]
-
-    ib = tvm.tir.ir_builder.create()
-
-    flag = ib.buffer_ptr(flag)
-    prefix_sum = ib.buffer_ptr(prefix_sum)
-
-    max_threads = int(tvm.target.Target.current(
-        allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = batch_size * num_anchors // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    # initialize prefix_sum
-    with ib.if_scope(tid < batch_size * num_anchors):
-        prefix_sum[tid] = 0
-    with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
-        j = idxm(tid, num_anchors)
-        with ib.for_range(0, j) as r:
-            prefix_sum[tid] += flag[i * num_anchors + r]
-
-    return ib.get()
-
-
-def out_rewrite(data, flag, prefix_sum, valid_count, out):
-    """Low level IR to move valid boxes to the
-    top of input data.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
-
-    flag : Buffer
-        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
-
-    prefix_sum : Buffer
-        2D Buffer of prefix sum of flags indicating new locations of valid boxes
-        with same shape as flag.
-
-    valid_count : Buffer
-        1D buffer for valid number of boxes with shape [batch_size, ].
-
-    out : Buffer
-        Rearranged data buffer.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = out.shape[0]
-    num_anchors = out.shape[1]
-    elem_length = out.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    one = tvm.tir.const(1, dtype=out.dtype)
-    data = ib.buffer_ptr(data)
-    flag = ib.buffer_ptr(flag)
-    valid_count = ib.buffer_ptr(valid_count)
-    prefix_sum = ib.buffer_ptr(prefix_sum)
-    out = ib.buffer_ptr(out)
-
-    max_threads = int(tvm.target.Target.current(
-        allow_none=False).max_num_threads)
-    nthread_tx = max_threads
-    nthread_bx = batch_size * num_anchors // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
-        j = idxm(tid, num_anchors)
-        base_idx = i * num_anchors * elem_length
-        with ib.if_scope(tvm.tir.all(flag[tid] > 0, prefix_sum[tid] >= 0,
-                                     prefix_sum[tid] < num_anchors)):
-            with ib.for_range(0, elem_length) as k:
-                out[base_idx + prefix_sum[tid] * elem_length +
-                    k] = data[tid * elem_length + k]
-        with ib.if_scope(j >= valid_count[i]):
-            with ib.for_range(0, elem_length) as k:
-                out[tid * elem_length + k] = -one
-
-    return ib.get()
-
-
 def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
     """Get valid count of bounding boxes given a score threshold.
     Also moves valid boxes to the top of input data.
@@ -267,17 +150,10 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
         Rearranged data tensor.
     """
     batch_size = data.shape[0]
-    num_anchors = data.shape[1]
     data_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "data_buf", data_alignment=8)
     valid_count_buf = tvm.tir.decl_buffer(
         (batch_size,), "int32", "valid_count_buf", data_alignment=8)
-    '''
-    temp_flag_buf = tvm.tir.decl_buffer(
-        (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
-    temp_partial_buf = tvm.tir.decl_buffer(
-        (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8)
-    '''
     out_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "out_buf", data_alignment=8)
 
@@ -290,26 +166,6 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
             out_buffers=[valid_count_buf, out_buf],
             name="get_valid_counts",
             tag="get_valid_counts_gpu")
-    '''
-    temp_partial = \
-        te.extern([(batch_size, num_anchors)], [temp_flag],
-                  lambda ins, outs: flag_scan(
-            ins[0], outs[0]),
-            dtype=["int32"],
-            in_buffers=[temp_flag_buf],
-            out_buffers=[temp_partial_buf],
-            name="flag_scan")
-
-    out = \
-        te.extern([data.shape], [data, temp_flag, temp_partial, valid_count],
-                  lambda ins, outs: out_rewrite(
-            ins[0], ins[1], ins[2], ins[3], outs[0]),
-            dtype=[data.dtype],
-            in_buffers=[data_buf, temp_flag_buf,
-                        temp_partial_buf, valid_count_buf],
-            out_buffers=[out_buf],
-            name="out_rewrite")
-    '''
 
     return [valid_count, out]
 
@@ -480,117 +336,6 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     return ib.get()
 
 
-def invalid_to_bottom_pre(data, flag, idx):
-    """Low level IR to rearrange nms output to move all valid entries to top.
-
-    Parameters
-    ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
-
-    flag : Buffer
-        1D Buffer of flag indicating valid data with [num_anchors].
-
-    idx : Buffer
-        1D Buffer of valid data indices with [num_anchors].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    flag = ib.buffer_ptr(flag)
-    idx = ib.buffer_ptr(idx)
-
-    max_threads = int(math.sqrt(
-        tvm.target.Target.current(allow_none=False).max_num_threads))
-    nthread_tx = max_threads
-    nthread_bx = num_anchors // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    j = bx * max_threads + tx
-
-    with ib.for_range(0, batch_size, for_type="unroll") as i:
-        base_idx = i * num_anchors * elem_length
-        with ib.if_scope(j < num_anchors):
-            with ib.if_scope(data[base_idx + j * elem_length] >= 0):
-                flag[i * num_anchors + j] = 1
-                idx[i * num_anchors + j] = 1
-            with ib.else_scope():
-                flag[i * num_anchors + j] = 0
-                idx[i * num_anchors + j] = 0
-
-    with ib.if_scope(j < batch_size):
-        with ib.for_range(0, num_anchors) as k:
-            with ib.if_scope(k > 0):
-                idx[j * num_anchors + k] += idx[j * num_anchors + k - 1]
-    return ib.get()
-
-
-def invalid_to_bottom_ir(data, flag, idx, out):
-    """Low level IR to rearrange nms output to move all valid entries to top.
-
-    Parameters
-    ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
-
-    flag : Buffer
-        1D Buffer of flag indicating valid data with [num_anchors].
-
-    idx : Buffer
-        1D Buffer of valid data indices with [num_anchors].
-
-    out : Buffer
-        3D Buffer of rearranged nms output with shape [batch_size, num_anchors, elem_length].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-
-    ib = tvm.tir.ir_builder.create()
-
-    data = ib.buffer_ptr(data)
-    flag = ib.buffer_ptr(flag)
-    idx = ib.buffer_ptr(idx)
-    out = ib.buffer_ptr(out)
-
-    max_threads = int(math.sqrt(
-        tvm.target.Target.current(allow_none=False).max_num_threads))
-    nthread_tx = max_threads
-    nthread_bx = num_anchors // max_threads + 1
-    tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    j = bx * max_threads + tx
-
-    with ib.for_range(0, batch_size, for_type="unroll") as i:
-        base_idx = i * num_anchors * elem_length
-        with ib.if_scope(j < num_anchors):
-            with ib.for_range(0, elem_length) as k:
-                out[base_idx + j * elem_length + k] = -1.0
-            with ib.if_scope(flag[i * num_anchors + j] > 0):
-                with ib.for_range(0, elem_length) as k:
-                    out[base_idx + (idx[i * num_anchors + j] - 1) * elem_length + k] \
-                        = data[base_idx + j * elem_length + k]
-    return ib.get()
-
-
 def non_max_suppression(data, valid_count, max_output_size=-1,
                         iou_threshold=0.5, force_suppress=False, top_k=-1,
                         coord_start=2, score_index=1, id_index=0,
@@ -678,7 +423,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
             score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype)
     else:
         sort_tensor = argsort(
-            score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
+            score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype)
 
     sort_tensor_buf = tvm.tir.decl_buffer(sort_tensor.shape, sort_tensor.dtype,
                                           "sort_tensor_buf", data_alignment=8)
@@ -686,9 +431,6 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
     data_buf = tvm.tir.decl_buffer(
         data.shape, data.dtype, "data_buf", data_alignment=8)
 
-    out_buf = tvm.tir.decl_buffer(
-        data.shape, data.dtype, "out_buf", data_alignment=8)
-
     out, box_indices = \
         te.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
@@ -704,32 +446,4 @@ def non_max_suppression(data, valid_count, max_output_size=-1,
     if return_indices:
         return box_indices
 
-    '''
-    if invalid_to_bottom:
-        output_buf = tvm.tir.decl_buffer(
-            data.shape, data.dtype, "output_buf", data_alignment=8)
-        temp_flag_buf = tvm.tir.decl_buffer(
-            score_shape, valid_count_dtype, "temp_flag", data_alignment=8)
-        temp_idx_buf = tvm.tir.decl_buffer(
-            score_shape, valid_count_dtype, "temp_idx", data_alignment=8)
-        temp_flag, temp_idx = te.extern([score_shape, score_shape], [out],
-                                        lambda ins, outs: invalid_to_bottom_pre(
-            ins[0], outs[0], outs[1]),
-            dtype=["int32", "int32"],
-            in_buffers=[out_buf],
-            out_buffers=[
-            temp_flag_buf, temp_idx_buf],
-            name="invalid_to_bottom_phase_one")
-
-        output = te.extern([data.shape], [out, temp_flag, temp_idx],
-                           lambda ins, outs: invalid_to_bottom_ir(
-            ins[0], ins[1], ins[2], outs[0]),
-            dtype=[data.dtype],
-            in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
-            out_buffers=[output_buf],
-            name="invalid_to_bottom",
-            tag="invalid_to_bottom")
-        return output
-    '''
-
     return out
diff --git a/topi/python/topi/nn/deformable_conv2d.py b/topi/python/topi/nn/deformable_conv2d.py
index 9f95fd1ae790..39be6d6d861f 100644
--- a/topi/python/topi/nn/deformable_conv2d.py
+++ b/topi/python/topi/nn/deformable_conv2d.py
@@ -106,7 +106,7 @@ def _bilinear(n, c, h, w):
                                     (kh * kernel_w + kw) * 2, y, x],
                              x * stride_w - pad_left + kw * dilation_w +
                              offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) +
-                                    (kh * kernel_w + kw) * 2 + 1, y, x]))
+                                    (kh * kernel_w + kw) * 2 + 1, y, x]), tag="data_deform")
     return te.compute(
         (batch, out_channel, out_height, out_width),
         lambda n, f, y, x: te.sum(
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index 0aa410d7ea13..54074e275711 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -104,8 +104,8 @@ def check_device(device):
         tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
 
     for device in ['llvm', 'cuda', 'opencl']:
-        # Disable opencl test for now
-        if device != "llvm" and device != "cuda":
+        # Disable gpu test for now
+        if device != "llvm":
             continue
         check_device(device)
 

From 5aa473ea26ee41ac414865d6e2b75d25a1d5be33 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Tue, 14 Apr 2020 21:42:21 +0000
Subject: [PATCH 4/5] revert back nms

---
 topi/python/topi/cuda/nms.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 13f97987094e..d8be3bd1b886 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -17,7 +17,6 @@
 # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison
 # pylint: disable=bad-continuation, unused-argument
 """Non-maximum suppression operator"""
-import math
 import tvm
 from tvm import te
 

From 43f1575ddcc8720d6795dd9aedfefb3b843f8bd0 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Wed, 15 Apr 2020 04:50:43 +0000
Subject: [PATCH 5/5] recover one test for get_valid_count

---
 tests/python/relay/test_op_level5.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 457211baaf66..b29b69667653 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -222,11 +222,12 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
         func = relay.Function([x], z.astuple())
         func = run_infer_type(func)
         for target, ctx in ctx_list():
-            if target == 'cuda':
-                return
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
+            # get_valid_count for cuda doesn't do data rearrangement
+            if target == 'cuda':
+                return
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
 
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)