From ed42e85e9a902ff721cc84a3802f3e4da73cc981 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Wed, 12 Feb 2020 21:57:48 +0000
Subject: [PATCH 01/10] get_valid_count accuracy issue fixed for individual
 tests but not for all tests running together

---
 tests/python/relay/test_op_level5.py  |  18 +-
 topi/python/topi/cuda/nms.py          | 434 +++++++++-----------------
 topi/tests/python/test_topi_vision.py |  30 +-
 3 files changed, 177 insertions(+), 305 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 03e700b3df83..1436d46f4680 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -221,17 +221,15 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
         func = relay.Function([x], z.astuple())
         func = run_infer_type(func)
         for target, ctx in ctx_list():
-            if target == 'cuda':
-                return
             intrp = relay.create_executor("debug", ctx=ctx, target=target)
             out = intrp.evaluate(func)(np_data)
             tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
 
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
-    verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
-    verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
-    verify_get_valid_counts((16, 500, 5), 0.95, -1, 0)
+    #verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
+    #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
+    #verify_get_valid_counts((16, 500, 5), 0.95, -1, 0)
 
 
 def test_non_max_suppression():
@@ -673,11 +671,11 @@ def verify_space_to_depth(dshape, block_size, layout):
 
 
 if __name__ == "__main__":
-    test_resize_infer_type()
-    test_resize()
-    test_crop_and_resize()
-    test_multibox_prior()
-    test_multibox_transform_loc()
+    #test_resize_infer_type()
+    #test_resize()
+    #test_crop_and_resize()
+    #test_multibox_prior()
+    #test_multibox_transform_loc()
     test_get_valid_counts()
     test_roi_align()
     test_roi_pool()
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 38f87a9523c8..da22dd0b0289 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -28,15 +28,34 @@
 from .. import tag
 
 
-def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index):
-    """Low level IR to Prepare get valid count of bounding boxes
+def cuda_atomicAdd_rule(op):
+    if op.dtype == "float32":
+        return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
+    elif op.dtype == "float64":
+        return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
+    elif op.dtype == "int32":
+        return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
+    else:
+        raise RuntimeError("only support int32, float32 and float64")
+
+
+tvm.register_intrin_rule(
+    "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True)
+
+
+def atomicAdd(x, y):
+    return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y)
+
+
+def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, score_index):
+    """Low level IR to get valid count of bounding boxes
     given a score threshold. Also moves valid boxes to the
     top of input data.
 
     Parameters
     ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
+    data : Buffer
+        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
 
     flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
@@ -44,14 +63,11 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index
     idx : Buffer
         2D Buffer of valid data indices with shape [batch_size, num_anchors].
 
-    score_threshold : float32
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
+    valid_count : Buffer
+        1-D buffer for valid number of boxes.
 
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
+    out : Buffer
+        Rearranged data buffer.
 
     Returns
     -------
@@ -60,14 +76,20 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index
     """
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
-    box_data_length = data.shape[2]
+    elem_length = data.shape[2]
 
     ib = tvm.ir_builder.create()
 
     data = ib.buffer_ptr(data)
-    flag = ib.buffer_ptr(flag)
-    idx = ib.buffer_ptr(idx)
-    score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold)
+
+    valid_count = ib.buffer_ptr(valid_count)
+    Flag = ib.buffer_ptr(Flag)
+    out = ib.buffer_ptr(out)
+    atomicAdd_return = ib.allocate(
+        valid_count.dtype, (1,), name='atomicAdd_return', scope='local')
+    one_count = tvm.const(1, dtype=valid_count.dtype)
+    score_threshold = tvm.make.node(
+        "FloatImm", dtype="float32", value=score_threshold)
     id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
     score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)
 
@@ -79,163 +101,36 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
+    idxd = tvm.indexdiv
 
     with ib.if_scope(tid < batch_size * num_anchors):
-        with ib.if_scope(tvm.all(data[tid * box_data_length + score_index] > score_threshold, \
-            tvm.any(id_index < 0, data[tid * box_data_length + id_index] >= 0))):
-            flag[tid] = 1
-            idx[tid] = 1
-        with ib.else_scope():
-            flag[tid] = 0
-            idx[tid] = 0
-
-    return ib.get()
-
-def get_valid_counts_upsweep(data, idx_in, idx, partial):
-    """Low level IR of first step of scan: unsweep.
-
-    Parameters
-    ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
-
-    idx_in : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
-
-    idx : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
-
-    partial : Buffer
-        2D Buffer of valid data indices with shape [batch_size, new_range].
+        i = idxd(tid, num_anchors)
+        Flag[tid] = 0
+        base_idx = i * num_anchors * elem_length
+        with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold,
+                                 tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
+            Flag[tid] = 1
+            with ib.for_range(0, elem_length) as k:
+                out[base_idx + k] = data[base_idx + k]
+            atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
+                                                                 valid_count[i]), one_count)
 
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    ib = tvm.ir_builder.create()
-    data = ib.buffer_ptr(data)
-    idx_in = ib.buffer_ptr(idx_in)
-    idx = ib.buffer_ptr(idx)
-    partial = ib.buffer_ptr(partial)
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    elem_per_thread = num_anchors // max_threads + 1
-    nthread_tx = max_threads
-    nthread_bx = batch_size
-    tx = tvm.thread_axis("threadIdx.x")
-    bx = tvm.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    new_range = num_anchors // elem_per_thread + 1
-    # Scan: Upsweep:
-    with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)):
-        with ib.for_range(0, elem_per_thread) as i:
-            with ib.if_scope(bx * num_anchors + \
-                             tx * elem_per_thread + i < batch_size * num_anchors):
-                with ib.if_scope(i == 0):
-                    partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread]
-                    idx[bx * num_anchors + tx * elem_per_thread] = \
-                    idx_in[bx * num_anchors + tx * elem_per_thread]
-                with ib.else_scope():
-                    partial[bx * new_range + tx] += \
-                    idx_in[bx * num_anchors + tx * elem_per_thread + i]
-                    idx[bx * num_anchors + tx * elem_per_thread + i] = \
-                    idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \
-                    idx_in[bx * num_anchors + tx * elem_per_thread + i]
-            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
-                                  tvm.convert(['shared']),
-                                  tvm.expr.Call.Intrinsic, None, 0))
     return ib.get()
 
-def get_valid_counts_scan(data, partial_in, partial):
-    """Low level IR to do scan.
-
-    Parameters
-    ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
 
-    idx_in : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
+def flag_scan(Flag, PrefixSum):
+    batch_size = Flag.shape[0]
+    num_anchors = Flag.shape[1]
 
-    idx : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
-
-    partial : Buffer
-        2D Buffer of valid data indices with shape [batch_size, new_range].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
     ib = tvm.ir_builder.create()
-    partial_in = ib.buffer_ptr(partial_in)
-    partial = ib.buffer_ptr(partial)
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    elem_per_thread = num_anchors // max_threads + 1
-    nthread_tx = max_threads
-    nthread_bx = batch_size
-    tx = tvm.thread_axis("threadIdx.x")
-    bx = tvm.thread_axis("blockIdx.x")
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    var = tvm.make.node("FloatImm", dtype="float32", value=2)
-    new_range = num_anchors // elem_per_thread + 1
-    iteration = cast(log(cast(new_range, "float32")) / math.log(2), "int32")
-    # Scan: Kogge-Stone adder
-    with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
-        with ib.for_range(0, iteration) as k:
-            with ib.if_scope(k == 0):
-                with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
-                    partial[bx * new_range + tx] = \
-                    partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
-                with ib.else_scope():
-                    partial[bx * new_range] = partial_in[bx * new_range]
-            with ib.else_scope():
-                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \
-                                         tx < tvm.min(new_range, num_anchors))):
-                    partial[bx * new_range + tx] += \
-                    partial[bx * new_range + tx - cast(power(var, k), "int32")]
-            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
-                                  tvm.convert(['shared']),
-                                  tvm.expr.Call.Intrinsic, None, 0))
-    return ib.get()
 
-def get_valid_counts_downsweep(data, idx_in, partial, idx):
-    """Low level IR to do downsweep of scan.
+    Flag = ib.buffer_ptr(Flag)
+    PrefixSum = ib.buffer_ptr(PrefixSum)
+    atomicAdd_return = ib.allocate(
+        "int32", (1,), name='atomicAdd_return', scope='local')
 
-    Parameters
-    ----------
-    data: Buffer
-        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.
-
-    idx_in : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
-
-    partial : Buffer
-        2D Buffer of valid data indices with shape [batch_size, new_range].
-
-    idx : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    ib = tvm.ir_builder.create()
-    idx_in = ib.buffer_ptr(idx_in)
-    idx = ib.buffer_ptr(idx)
-    partial = ib.buffer_ptr(partial)
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    elem_per_thread = num_anchors // max_threads + 1
+    max_threads = int(tvm.target.current_target(
+        allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = batch_size * num_anchors // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
@@ -243,69 +138,44 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx):
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
-    new_range = num_anchors // elem_per_thread + 1
     idxd = tvm.indexdiv
     idxm = tvm.indexmod
-    # Scan: Downsweep:
-    with ib. if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors) # number of batches
-        j = idxm(tid, num_anchors) # number of anchors
-        with ib.if_scope(j < elem_per_thread):
-            idx[tid] = idx_in[tid]
-        with ib.else_scope():
-            idx[tid] = idx_in[tid] + partial[i * new_range + idxd(j, elem_per_thread) - 1]
-
-    return ib.get()
-
-def get_valid_counts_ir(data, flag, idx, valid_count, out):
-    """Low level IR to get valid count of bounding boxes
-    given a score threshold. Also moves valid boxes to the
-    top of input data.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
 
-    flag : Buffer
-        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
+    with ib.if_scope(tid < batch_size * num_anchors):
+        i = idxd(tid, num_anchors)
+        j = idxm(tid, num_anchors)
+        with ib.for_range(0, j) as r:
+            # TODO: no difference for the following two
+            PrefixSum[tid] += Flag[i * num_anchors + r]
+            # atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
+            #                                                     PrefixSum[tid]), Flag[i * num_anchors + r])
 
-    idx : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
+    return ib.get()
 
-    valid_count : Buffer
-        1-D buffer for valid number of boxes.
 
-    out : Buffer
-        Rearranged data buffer.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    elem_length = data.shape[2]
-    size = batch_size * num_anchors * elem_length
+def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out):
+    batch_size = out.shape[0]
+    num_anchors = out.shape[1]
+    elem_length = out.shape[2]
 
     ib = tvm.ir_builder.create()
 
+    one = tvm.const(1, dtype=out.dtype)
     data = ib.buffer_ptr(data)
-    flag = ib.buffer_ptr(flag)
-    idx = ib.buffer_ptr(idx)
+    Flag = ib.buffer_ptr(Flag)
     valid_count = ib.buffer_ptr(valid_count)
+    PrefixSum = ib.buffer_ptr(PrefixSum)
+    out_in = ib.buffer_ptr(out_in)
     out = ib.buffer_ptr(out)
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
-    nthread_bx = batch_size * num_anchors * elem_length // max_threads + 1
+    nthread_bx = batch_size * num_anchors // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
-
     idxd = tvm.indexdiv
     idxm = tvm.indexmod
 
@@ -313,17 +183,15 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         base_idx = i * num_anchors * elem_length
-        with ib.if_scope(flag[tid] > 0):
+        out[tid] = out_in[tid]
+        with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)):
+            with ib.for_range(0, elem_length) as k:
+                out[base_idx + PrefixSum[tid] * elem_length +
+                    k] = data[tid * elem_length + k]
+        with ib.if_scope(j >= valid_count[i]):
             with ib.for_range(0, elem_length) as k:
-                with ib.if_scope(base_idx + (idx[tid] - 1) * elem_length + k < size):
-                    out[base_idx + (idx[tid] - 1) * elem_length + k] =\
-                    data[base_idx + j * elem_length + k]
-        with ib.if_scope(j == 0):
-            valid_count[i] = idx[tid + num_anchors - 1]
-        with ib.if_scope(j >= idx[i * num_anchors + num_anchors - 1]):
-            with ib.for_range(0, elem_length) as l:
-                with ib.if_scope(tid * elem_length + l < size):
-                    out[tid * elem_length + l] = -1.0
+                out[tid * elem_length + k] = -one
+
     return ib.get()
 
 
@@ -356,56 +224,46 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1):
     """
     batch_size = data.shape[0]
     num_anchors = data.shape[1]
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
-    elem_per_thread = num_anchors // max_threads + 1
-    new_range = num_anchors // elem_per_thread + 1
+    data_buf = api.decl_buffer(
+        data.shape, data.dtype, "data_buf", data_alignment=8)
+    valid_count_buf = api.decl_buffer(
+        (batch_size,), "int32", "valid_count_buf", data_alignment=8)
+    temp_out_buf = api.decl_buffer(
+        data.shape, data.dtype, "temp_out_buf", data_alignment=8)
     temp_flag_buf = api.decl_buffer(
         (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
-    temp_idx_buf = api.decl_buffer(
-        (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8)
     temp_partial_buf = api.decl_buffer(
-        (batch_size, new_range), "int32", "temp_partial", data_alignment=8)
-    data_buf = api.decl_buffer(
-        data.shape, data.dtype, "data_buf", data_alignment=8)
-
-    temp_flag, temp_idx = \
-        tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data],
-                   lambda ins, outs: get_valid_counts_pre(
-                       ins[0], outs[0], outs[1], score_threshold, id_index, score_index),
-                   dtype=["int32", "int32"],
-                   out_buffers=[temp_flag_buf, temp_idx_buf],
-                   name="get_valid_counts_phase_one")
-    temp_idx_new, temp_partial = \
-        tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx],
-                   lambda ins, outs: get_valid_counts_upsweep(
-                       ins[0], ins[1], outs[0], outs[1]),
-                   dtype=["int32", "int32"],
-                   out_buffers=[temp_idx_buf, temp_partial_buf],
-                   name="get_valid_counts_phase_two")
-    temp_partial_new = \
-        tvm.extern([(batch_size, new_range)], [data, temp_partial],
-                   lambda ins, outs: get_valid_counts_scan(
-                       ins[0], ins[1], outs[0]),
-                   dtype=["int32"],
-                   out_buffers=[temp_partial_buf],
-                   name="get_valid_counts_phase_three")
-    temp_idx_final = \
-        tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new],
-                   lambda ins, outs: get_valid_counts_downsweep(
-                       ins[0], ins[1], ins[2], outs[0]),
-                   dtype=["int32"],
-                   out_buffers=[temp_idx_buf],
-                   name="get_valid_counts_phase_four")
-    valid_count, out_tensor = \
-	tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final],
-               lambda ins, outs: get_valid_counts_ir(
-                ins[0], ins[1], ins[2], outs[0], outs[1]),
-            dtype=["int32", data.dtype],
-            in_buffers=[data_buf, temp_flag_buf, temp_idx_buf],
-            name="get_valid_counts_phase_five",
+        (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8)
+
+    valid_count, temp_flag, out_in = \
+        tvm.extern([(batch_size,), (batch_size, num_anchors), data.shape], [data],
+                   lambda ins, outs: get_valid_counts_ir(
+            ins[0], outs[0], outs[1], outs[2], score_threshold, id_index, score_index),
+            dtype=["int32", "int32", data.dtype],
+            in_buffers=[data_buf],
+            out_buffers=[valid_count_buf, temp_flag_buf, temp_out_buf],
+            name="get_valid_counts",
             tag="get_valid_counts_gpu")
 
-    return [valid_count, out_tensor]
+    temp_partial = \
+        tvm.extern([(batch_size, num_anchors)], [temp_flag],
+                   lambda ins, outs: flag_scan(
+            ins[0], outs[0]),
+            dtype=["int32"],
+            in_buffers=[temp_flag_buf],
+            out_buffers=[temp_partial_buf],
+            name="flag_scan")
+
+    out = \
+        tvm.extern([data.shape], [data, temp_flag, temp_partial, out_in, valid_count],
+                   lambda ins, outs: out_rewrite(
+            ins[0], ins[1], ins[2], ins[3], ins[4], outs[0]),
+            dtype=[data.dtype],
+            in_buffers=[data_buf, temp_flag_buf,
+                        temp_partial_buf, temp_out_buf, valid_count_buf],
+            name="out_rewrite")
+
+    return [valid_count, out]
 
 
 def nms_ir(data, sorted_index, valid_count, out, box_indices,
@@ -479,7 +337,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     valid_count = ib.buffer_ptr(valid_count)
     out = ib.buffer_ptr(out)
     box_indices = ib.buffer_ptr(box_indices)
-    num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")
+    num_valid_boxes = ib.allocate(
+        "int32", (1,), name="num_valid_boxes", scope="local")
 
     max_threads = int(
         tvm.target.Target.current(allow_none=False).max_num_threads)
@@ -491,26 +350,29 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     j = bx * max_threads + tx
 
-    iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
+    iou_threshold = tvm.make.node(
+        "FloatImm", dtype="float32", value=iou_threshold)
     top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
     coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start)
     id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
     score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)
-    force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)
+    force_suppress = tvm.make.node(
+        "IntImm", dtype="int32", value=1 if force_suppress else 0)
 
     with ib.for_range(0, batch_size, for_type="unroll") as i:
         base_idx = i * num_anchors * box_data_length
         with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)):
             # Reorder output
-            nkeep = if_then_else( \
-                    tvm.all(top_k > 0, top_k < valid_count[i]),
-                    top_k, valid_count[i])
+            nkeep = if_then_else(
+                tvm.all(top_k > 0, top_k < valid_count[i]),
+                top_k, valid_count[i])
             with ib.if_scope(j < nkeep):
                 with ib.for_range(0, box_data_length) as k:
                     out[(base_idx + j * box_data_length + k)] = \
-                    data[(base_idx + sorted_index[i * num_anchors + j] \
-                    * box_data_length + k)]
-                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
+                        data[(base_idx + sorted_index[i * num_anchors + j]
+                              * box_data_length + k)]
+                box_indices[i * num_anchors +
+                            j] = sorted_index[i * num_anchors + j]
             with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
                 with ib.if_scope(j < valid_count[i] - nkeep):
                     with ib.for_range(0, box_data_length) as k:
@@ -519,16 +381,17 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             # Apply nms
             with ib.for_range(0, valid_count[i]) as k:
                 offset_k = k * box_data_length
-                with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, \
-                    tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))):
+                with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0,
+                                         tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))):
                     with ib.if_scope(j < valid_count[i]):
                         offset_j = j * box_data_length
-                        with ib.if_scope(tvm.all(j > k, \
-                            out[base_idx + offset_j + score_index] > 0, \
-                                                 tvm.any(id_index < 0, \
-                                                    out[base_idx + offset_j + id_index] >= 0), \
-						 tvm.any(force_suppress > 0, id_index < 0, \
-                                                         out[base_idx + offset_k + id_index] == \
+                        with ib.if_scope(tvm.all(j > k,
+                                                 out[base_idx + offset_j +
+                                                     score_index] > 0,
+                                                 tvm.any(id_index < 0,
+                                                         out[base_idx + offset_j + id_index] >= 0),
+                                                 tvm.any(force_suppress > 0, id_index < 0,
+                                                         out[base_idx + offset_k + id_index] ==
                                                          out[base_idx + offset_j + id_index]))):
                             iou = calculate_overlap(out, base_idx + offset_j + coord_start,
                                                     base_idx + offset_k + coord_start)
@@ -541,12 +404,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             with ib.if_scope(j < valid_count[i]):
                 offset_j = j * box_data_length
                 with ib.for_range(0, box_data_length) as k:
-                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
+                    out[(base_idx + offset_j + k)
+                        ] = data[base_idx + offset_j + k]
                 box_indices[i * num_anchors + j] = j
         # Set invalid entry to be -1
         with ib.if_scope(j < num_anchors - valid_count[i]):
             with ib.for_range(0, box_data_length) as k:
-                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
+                out[base_idx + (j + valid_count[i]) *
+                    box_data_length + k] = -1.0
             box_indices[i * num_anchors + j + valid_count[i]] = -1
         # Only return max_output_size number of valid boxes
         num_valid_boxes[0] = 0
@@ -671,7 +536,7 @@ def invalid_to_bottom_ir(data, flag, idx, out):
             with ib.if_scope(flag[i * num_anchors + j] > 0):
                 with ib.for_range(0, elem_length) as k:
                     out[base_idx + (idx[i * num_anchors + j] - 1) * elem_length + k] \
-                    = data[base_idx + j * elem_length + k]
+                        = data[base_idx + j * elem_length + k]
     return ib.get()
 
 
@@ -756,8 +621,10 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1,
                                       "valid_count_buf", data_alignment=4)
     score_axis = score_index
     score_shape = (batch_size, num_anchors)
-    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE)
-    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
+    score_tensor = tvm.compute(
+        score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE)
+    sort_tensor = argsort(
+        score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
 
     sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype,
                                       "sort_tensor_buf", data_alignment=8)
@@ -795,7 +662,8 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1,
                                              ins[0], outs[0], outs[1]),
                                          dtype=["int32", "int32"],
                                          in_buffers=[out_buf],
-                                         out_buffers=[temp_flag_buf, temp_idx_buf],
+                                         out_buffers=[
+                                             temp_flag_buf, temp_idx_buf],
                                          name="invalid_to_bottom_phase_one")
 
         output = tvm.extern([data.shape], [out, temp_flag, temp_idx],
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index a081f0797dad..4e3bce4aa94e 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -63,21 +63,27 @@ def check_device(device):
         tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx)
         f = tvm.build(s, [data, outs[0], outs[1]], device)
         f(tvm_input_data, tvm_out1, tvm_out2)
+        import sys
+        np.set_printoptions(threshold=sys.maxsize)
+        #print(tvm_out2.asnumpy())
+        #print("====================================")
+        #print(np_out2)
+        #print("===============diff================")
+        #print(tvm_out2.asnumpy() - np_out2)
         tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
         tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
 
-    for device in ['llvm', 'cuda', 'opencl']:
+    for device in ['cuda', 'opencl']:
         # Disable gpu test for now
-        if device != "llvm":
-            continue
         check_device(device)
 
 
 def test_get_valid_counts():
-    verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
-    verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
-    verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
-    verify_get_valid_counts((16, 500, 5), 0.95, -1, 1)
+    verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1)
+    #verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
+    #verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
+    #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
+    #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1)
 
 
 def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, iou_threshold,
@@ -424,8 +430,8 @@ def test_proposal():
 
 if __name__ == "__main__":
     test_get_valid_counts()
-    test_non_max_suppression()
-    test_multibox_prior()
-    test_multibox_detection()
-    test_roi_align()
-    test_proposal()
+    #test_non_max_suppression()
+    #test_multibox_prior()
+    #test_multibox_detection()
+    #test_roi_align()
+    #test_proposal()

From ffac02f8d20ecb54e054b9c52edc0ff89d02eb03 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Fri, 14 Feb 2020 21:52:46 +0000
Subject: [PATCH 02/10] minor fix

---
 topi/python/topi/cuda/nms.py          | 48 ++++++++++++---------------
 topi/tests/python/test_topi_vision.py |  2 +-
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index da22dd0b0289..6ba97e87da0c 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -39,7 +39,7 @@ def cuda_atomicAdd_rule(op):
         raise RuntimeError("only support int32, float32 and float64")
 
 
-tvm.register_intrin_rule(
+tvm.target.intrin.register_intrin_rule(
     "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True)
 
 
@@ -47,7 +47,7 @@ def atomicAdd(x, y):
     return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y)
 
 
-def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, score_index):
+def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also moves valid boxes to the
     top of input data.
@@ -84,7 +84,6 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index,
 
     valid_count = ib.buffer_ptr(valid_count)
     Flag = ib.buffer_ptr(Flag)
-    out = ib.buffer_ptr(out)
     atomicAdd_return = ib.allocate(
         valid_count.dtype, (1,), name='atomicAdd_return', scope='local')
     one_count = tvm.const(1, dtype=valid_count.dtype)
@@ -93,7 +92,8 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index,
     id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
     score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)
 
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+    max_threads = int(tvm.target.Target.current(
+        allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = batch_size * num_anchors // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
@@ -106,14 +106,13 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index,
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         Flag[tid] = 0
-        base_idx = i * num_anchors * elem_length
         with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold,
                                  tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
             Flag[tid] = 1
-            with ib.for_range(0, elem_length) as k:
-                out[base_idx + k] = data[base_idx + k]
             atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
                                                                  valid_count[i]), one_count)
+    with ib.else_scope():
+        Flag[tid] = 0
 
     return ib.get()
 
@@ -126,10 +125,8 @@ def flag_scan(Flag, PrefixSum):
 
     Flag = ib.buffer_ptr(Flag)
     PrefixSum = ib.buffer_ptr(PrefixSum)
-    atomicAdd_return = ib.allocate(
-        "int32", (1,), name='atomicAdd_return', scope='local')
 
-    max_threads = int(tvm.target.current_target(
+    max_threads = int(tvm.target.Target.current(
         allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = batch_size * num_anchors // max_threads + 1
@@ -145,15 +142,12 @@ def flag_scan(Flag, PrefixSum):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         with ib.for_range(0, j) as r:
-            # TODO: no difference for the following two
             PrefixSum[tid] += Flag[i * num_anchors + r]
-            # atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
-            #                                                     PrefixSum[tid]), Flag[i * num_anchors + r])
 
     return ib.get()
 
 
-def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out):
+def out_rewrite(data, Flag, PrefixSum, valid_count, out):
     batch_size = out.shape[0]
     num_anchors = out.shape[1]
     elem_length = out.shape[2]
@@ -165,10 +159,10 @@ def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out):
     Flag = ib.buffer_ptr(Flag)
     valid_count = ib.buffer_ptr(valid_count)
     PrefixSum = ib.buffer_ptr(PrefixSum)
-    out_in = ib.buffer_ptr(out_in)
     out = ib.buffer_ptr(out)
 
-    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
+    max_threads = int(tvm.target.Target.current(
+        allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = batch_size * num_anchors // max_threads + 1
     tx = tvm.thread_axis("threadIdx.x")
@@ -183,7 +177,6 @@ def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         base_idx = i * num_anchors * elem_length
-        out[tid] = out_in[tid]
         with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)):
             with ib.for_range(0, elem_length) as k:
                 out[base_idx + PrefixSum[tid] * elem_length +
@@ -228,20 +221,20 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1):
         data.shape, data.dtype, "data_buf", data_alignment=8)
     valid_count_buf = api.decl_buffer(
         (batch_size,), "int32", "valid_count_buf", data_alignment=8)
-    temp_out_buf = api.decl_buffer(
-        data.shape, data.dtype, "temp_out_buf", data_alignment=8)
     temp_flag_buf = api.decl_buffer(
         (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
     temp_partial_buf = api.decl_buffer(
         (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8)
+    out_buf = api.decl_buffer(
+        data.shape, data.dtype, "out_buf", data_alignment=8)
 
-    valid_count, temp_flag, out_in = \
-        tvm.extern([(batch_size,), (batch_size, num_anchors), data.shape], [data],
+    valid_count, temp_flag = \
+        tvm.extern([(batch_size,), (batch_size, num_anchors)], [data],
                    lambda ins, outs: get_valid_counts_ir(
-            ins[0], outs[0], outs[1], outs[2], score_threshold, id_index, score_index),
-            dtype=["int32", "int32", data.dtype],
+            ins[0], outs[0], outs[1], score_threshold, id_index, score_index),
+            dtype=["int32", "int32"],
             in_buffers=[data_buf],
-            out_buffers=[valid_count_buf, temp_flag_buf, temp_out_buf],
+            out_buffers=[valid_count_buf, temp_flag_buf],
             name="get_valid_counts",
             tag="get_valid_counts_gpu")
 
@@ -255,12 +248,13 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1):
             name="flag_scan")
 
     out = \
-        tvm.extern([data.shape], [data, temp_flag, temp_partial, out_in, valid_count],
+        tvm.extern([data.shape], [data, temp_flag, temp_partial, valid_count],
                    lambda ins, outs: out_rewrite(
-            ins[0], ins[1], ins[2], ins[3], ins[4], outs[0]),
+            ins[0], ins[1], ins[2], ins[3], outs[0]),
             dtype=[data.dtype],
             in_buffers=[data_buf, temp_flag_buf,
-                        temp_partial_buf, temp_out_buf, valid_count_buf],
+                        temp_partial_buf, valid_count_buf],
+            out_buffers=[out_buf],
             name="out_rewrite")
 
     return [valid_count, out]
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index 4e3bce4aa94e..0207de801648 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -80,7 +80,7 @@ def check_device(device):
 
 def test_get_valid_counts():
     verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1)
-    #verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
+    verify_get_valid_counts((1, 125000, 6), 0, 0, 1)
     #verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
     #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
     #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1)

From e1dfe59b9d7a5998ccd9a1c20f0bbd735b17f4d9 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 17 Feb 2020 22:46:21 +0000
Subject: [PATCH 03/10] initialize valid_count and PrefixSum buffers

---
 topi/python/topi/cuda/nms.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 6ba97e87da0c..46ad335e04cc 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -103,6 +103,8 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     tid = bx * max_threads + tx
     idxd = tvm.indexdiv
 
+    with ib.if_scope(tid < batch_size):
+        valid_count[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         Flag[tid] = 0
@@ -111,8 +113,6 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
             Flag[tid] = 1
             atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
                                                                  valid_count[i]), one_count)
-    with ib.else_scope():
-        Flag[tid] = 0
 
     return ib.get()
 
@@ -138,6 +138,8 @@ def flag_scan(Flag, PrefixSum):
     idxd = tvm.indexdiv
     idxm = tvm.indexmod
 
+    with ib.if_scope(tid < batch_size * num_anchors):
+        PrefixSum[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)

From e1e8d6f946daec399f578fbf7d96c7abeac0cefc Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 17 Feb 2020 23:25:11 +0000
Subject: [PATCH 04/10] test updated

---
 topi/python/topi/cuda/nms.py          |  6 ++++-
 topi/tests/python/test_topi_vision.py | 32 +++++++++++----------------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 46ad335e04cc..67a7e9458b60 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -103,11 +103,14 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     tid = bx * max_threads + tx
     idxd = tvm.indexdiv
 
+    # initialize valid_count
     with ib.if_scope(tid < batch_size):
         valid_count[tid] = 0
+    # initialize Flag
     with ib.if_scope(tid < batch_size * num_anchors):
-        i = idxd(tid, num_anchors)
         Flag[tid] = 0
+    with ib.if_scope(tid < batch_size * num_anchors):
+        i = idxd(tid, num_anchors)
         with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold,
                                  tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
             Flag[tid] = 1
@@ -138,6 +141,7 @@ def flag_scan(Flag, PrefixSum):
     idxd = tvm.indexdiv
     idxm = tvm.indexmod
 
+    # initialize PrefixSum
     with ib.if_scope(tid < batch_size * num_anchors):
         PrefixSum[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index 0207de801648..85e4180a0892 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -63,27 +63,21 @@ def check_device(device):
         tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx)
         f = tvm.build(s, [data, outs[0], outs[1]], device)
         f(tvm_input_data, tvm_out1, tvm_out2)
-        import sys
-        np.set_printoptions(threshold=sys.maxsize)
-        #print(tvm_out2.asnumpy())
-        #print("====================================")
-        #print(np_out2)
-        #print("===============diff================")
-        #print(tvm_out2.asnumpy() - np_out2)
         tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3)
         tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3)
 
-    for device in ['cuda', 'opencl']:
-        # Disable gpu test for now
+    for device in ['llvm', 'cuda', 'opencl']:
+        # Disable opencl test for now
+        if device != "llvm" and device != "cuda":
+            continue
         check_device(device)
 
 
 def test_get_valid_counts():
-    verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1)
-    verify_get_valid_counts((1, 125000, 6), 0, 0, 1)
-    #verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
-    #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
-    #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1)
+    verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
+    verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
+    verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
+    verify_get_valid_counts((16, 500, 5), 0.95, -1, 1)
 
 
 def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, iou_threshold,
@@ -430,8 +424,8 @@ def test_proposal():
 
 if __name__ == "__main__":
     test_get_valid_counts()
-    #test_non_max_suppression()
-    #test_multibox_prior()
-    #test_multibox_detection()
-    #test_roi_align()
-    #test_proposal()
+    test_non_max_suppression()
+    test_multibox_prior()
+    test_multibox_detection()
+    test_roi_align()
+    test_proposal()

From a0795d3e010762ec6f670e93eee868cc3a9b0907 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 17 Feb 2020 23:30:52 +0000
Subject: [PATCH 05/10] udpate relay test as well

---
 tests/python/relay/test_op_level5.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 1436d46f4680..e622a8ae01ab 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -227,9 +227,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
             tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04)
 
     verify_get_valid_counts((1, 2500, 6), 0, 0, 1)
-    #verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
-    #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
-    #verify_get_valid_counts((16, 500, 5), 0.95, -1, 0)
+    verify_get_valid_counts((1, 2500, 5), -1, -1, 0)
+    verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0)
+    verify_get_valid_counts((16, 500, 5), 0.95, -1, 0)
 
 
 def test_non_max_suppression():
@@ -671,11 +671,11 @@ def verify_space_to_depth(dshape, block_size, layout):
 
 
 if __name__ == "__main__":
-    #test_resize_infer_type()
-    #test_resize()
-    #test_crop_and_resize()
-    #test_multibox_prior()
-    #test_multibox_transform_loc()
+    test_resize_infer_type()
+    test_resize()
+    test_crop_and_resize()
+    test_multibox_prior()
+    test_multibox_transform_loc()
     test_get_valid_counts()
     test_roi_align()
     test_roi_pool()

From 48738bd79db42f4e392dbd229fcd7c79ecce2e45 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 17 Feb 2020 23:38:39 +0000
Subject: [PATCH 06/10] update document

---
 topi/python/topi/cuda/nms.py | 59 +++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 67a7e9458b60..0ab3cfd7b76f 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -49,7 +49,7 @@ def atomicAdd(x, y):
 
 def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes
-    given a score threshold. Also moves valid boxes to the
+    given a score threshold. Also prepares to move valid boxes to the
     top of input data.
 
     Parameters
@@ -57,17 +57,20 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     data : Buffer
         Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
 
-    flag : Buffer
+    valid_count : Buffer
+        1D buffer for valid number of boxes with shape [batch_size, ].
+
+    Flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
-    idx : Buffer
-        2D Buffer of valid data indices with shape [batch_size, num_anchors].
+    score_threshold : float32
+        Lower limit of score for valid bounding boxes.
 
-    valid_count : Buffer
-        1-D buffer for valid number of boxes.
+    id_index : optional, int
+        index of the class categories, -1 to disable.	
 
-    out : Buffer
-        Rearranged data buffer.
+    score_index: optional, int
+        Index of the scores/confidence of boxes.
 
     Returns
     -------
@@ -121,6 +124,21 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
 
 
 def flag_scan(Flag, PrefixSum):
+    """Low level IR to calculate correct positions for valid boxes.
+
+    Parameters
+    ----------
+    Flag : Buffer
+        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
+
+    PrefixSum : Buffer
+        2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
     batch_size = Flag.shape[0]
     num_anchors = Flag.shape[1]
 
@@ -154,6 +172,31 @@ def flag_scan(Flag, PrefixSum):
 
 
 def out_rewrite(data, Flag, PrefixSum, valid_count, out):
+    """Low level IR to move valid boxes to the
+    top of input data.
+
+    Parameters
+    ----------
+    data : Buffer
+        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
+
+    Flag : Buffer
+        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
+
+    PrefixSum : Buffer
+        2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag.
+
+    valid_count : Buffer
+        1D buffer for valid number of boxes with shape [batch_size, ].
+
+    out : Buffer
+        Rearranged data buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
     batch_size = out.shape[0]
     num_anchors = out.shape[1]
     elem_length = out.shape[2]

From 94e87f5ed88c63bcffecab9fded365244b0925c2 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Tue, 18 Feb 2020 00:46:51 +0000
Subject: [PATCH 07/10] fix lint

---
 topi/python/topi/cuda/nms.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 0ab3cfd7b76f..bc5227165eac 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -21,8 +21,7 @@
 import tvm
 
 from tvm import api
-from tvm.generic import cast
-from tvm.intrin import if_then_else, log, power
+from tvm.intrin import if_then_else
 from topi.vision import non_max_suppression, get_valid_counts
 from .sort import argsort
 from .. import tag
@@ -31,12 +30,11 @@
 def cuda_atomicAdd_rule(op):
     if op.dtype == "float32":
         return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
-    elif op.dtype == "float64":
+    if op.dtype == "float64":
         return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
-    elif op.dtype == "int32":
+    if op.dtype == "int32":
         return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
-    else:
-        raise RuntimeError("only support int32, float32 and float64")
+    raise RuntimeError("only support int32, float32 and float64")
 
 
 tvm.target.intrin.register_intrin_rule(
@@ -67,7 +65,7 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
         Lower limit of score for valid bounding boxes.
 
     id_index : optional, int
-        index of the class categories, -1 to disable.	
+        index of the class categories, -1 to disable.
 
     score_index: optional, int
         Index of the scores/confidence of boxes.
@@ -132,7 +130,8 @@ def flag_scan(Flag, PrefixSum):
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
     PrefixSum : Buffer
-        2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag.
+        2D Buffer of prefix sum of flags indicating new locations of valid boxes
+        with same shape as Flag.
 
     Returns
     -------
@@ -184,7 +183,8 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out):
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
     PrefixSum : Buffer
-        2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag.
+        2D Buffer of prefix sum of flags indicating new locations of valid boxes
+        with same shape as Flag.
 
     valid_count : Buffer
         1D buffer for valid number of boxes with shape [batch_size, ].
@@ -425,7 +425,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
             with ib.for_range(0, valid_count[i]) as k:
                 offset_k = k * box_data_length
                 with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0,
-                                         tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))):
+                                         tvm.any(id_index < 0, out[base_idx +
+                                                                   offset_k + id_index] >= 0))):
                     with ib.if_scope(j < valid_count[i]):
                         offset_j = j * box_data_length
                         with ib.if_scope(tvm.all(j > k,

From 03d4cc797a409f43872b3d29a5a2a982d8046d7b Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Wed, 19 Feb 2020 02:47:42 +0000
Subject: [PATCH 08/10] address comment

---
 topi/python/topi/cuda/nms.py | 70 ++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index bc5227165eac..e4880005dabb 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -27,25 +27,25 @@
 from .. import tag
 
 
-def cuda_atomicAdd_rule(op):
+def cuda_atomic_add_rule(op):
     if op.dtype == "float32":
-        return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
+        return tvm.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1])
     if op.dtype == "float64":
-        return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
+        return tvm.call_pure_extern("float64", "atomic_add", op.args[0], op.args[1])
     if op.dtype == "int32":
-        return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
+        return tvm.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
     raise RuntimeError("only support int32, float32 and float64")
 
 
 tvm.target.intrin.register_intrin_rule(
-    "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True)
+    "cuda", "atomic_add", cuda_atomic_add_rule, override=True)
 
 
-def atomicAdd(x, y):
-    return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y)
+def atomic_add(x, y):
+    return tvm.call_pure_intrin(y.dtype, "atomic_add", x, y)
 
 
-def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index):
+def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index):
     """Low level IR to get valid count of bounding boxes
     given a score threshold. Also prepares to move valid boxes to the
     top of input data.
@@ -58,7 +58,7 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     valid_count : Buffer
         1D buffer for valid number of boxes with shape [batch_size, ].
 
-    Flag : Buffer
+    flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
     score_threshold : float32
@@ -84,9 +84,9 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     data = ib.buffer_ptr(data)
 
     valid_count = ib.buffer_ptr(valid_count)
-    Flag = ib.buffer_ptr(Flag)
-    atomicAdd_return = ib.allocate(
-        valid_count.dtype, (1,), name='atomicAdd_return', scope='local')
+    flag = ib.buffer_ptr(flag)
+    atomic_add_return = ib.allocate(
+        valid_count.dtype, (1,), name='atomic_add_return', scope='local')
     one_count = tvm.const(1, dtype=valid_count.dtype)
     score_threshold = tvm.make.node(
         "FloatImm", dtype="float32", value=score_threshold)
@@ -107,44 +107,44 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor
     # initialize valid_count
     with ib.if_scope(tid < batch_size):
         valid_count[tid] = 0
-    # initialize Flag
+    # initialize flag
     with ib.if_scope(tid < batch_size * num_anchors):
-        Flag[tid] = 0
+        flag[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold,
                                  tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
-            Flag[tid] = 1
-            atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of",
+            flag[tid] = 1
+            atomic_add_return[0] = atomic_add(tvm.call_pure_intrin("handle", "tvm_address_of",
                                                                  valid_count[i]), one_count)
 
     return ib.get()
 
 
-def flag_scan(Flag, PrefixSum):
+def flag_scan(flag, prefix_sum):
     """Low level IR to calculate correct positions for valid boxes.
 
     Parameters
     ----------
-    Flag : Buffer
+    flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
-    PrefixSum : Buffer
+    prefix_sum : Buffer
         2D Buffer of prefix sum of flags indicating new locations of valid boxes
-        with same shape as Flag.
+        with same shape as flag.
 
     Returns
     -------
     stmt : Stmt
         The result IR statement.
     """
-    batch_size = Flag.shape[0]
-    num_anchors = Flag.shape[1]
+    batch_size = flag.shape[0]
+    num_anchors = flag.shape[1]
 
     ib = tvm.ir_builder.create()
 
-    Flag = ib.buffer_ptr(Flag)
-    PrefixSum = ib.buffer_ptr(PrefixSum)
+    flag = ib.buffer_ptr(flag)
+    prefix_sum = ib.buffer_ptr(prefix_sum)
 
     max_threads = int(tvm.target.Target.current(
         allow_none=False).max_num_threads)
@@ -158,19 +158,19 @@ def flag_scan(Flag, PrefixSum):
     idxd = tvm.indexdiv
     idxm = tvm.indexmod
 
-    # initialize PrefixSum
+    # initialize prefix_sum
     with ib.if_scope(tid < batch_size * num_anchors):
-        PrefixSum[tid] = 0
+        prefix_sum[tid] = 0
     with ib.if_scope(tid < batch_size * num_anchors):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         with ib.for_range(0, j) as r:
-            PrefixSum[tid] += Flag[i * num_anchors + r]
+            prefix_sum[tid] += flag[i * num_anchors + r]
 
     return ib.get()
 
 
-def out_rewrite(data, Flag, PrefixSum, valid_count, out):
+def out_rewrite(data, flag, prefix_sum, valid_count, out):
     """Low level IR to move valid boxes to the
     top of input data.
 
@@ -179,12 +179,12 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out):
     data : Buffer
         Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].
 
-    Flag : Buffer
+    flag : Buffer
         2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].
 
-    PrefixSum : Buffer
+    prefix_sum : Buffer
         2D Buffer of prefix sum of flags indicating new locations of valid boxes
-        with same shape as Flag.
+        with same shape as flag.
 
     valid_count : Buffer
         1D buffer for valid number of boxes with shape [batch_size, ].
@@ -205,9 +205,9 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out):
 
     one = tvm.const(1, dtype=out.dtype)
     data = ib.buffer_ptr(data)
-    Flag = ib.buffer_ptr(Flag)
+    flag = ib.buffer_ptr(flag)
     valid_count = ib.buffer_ptr(valid_count)
-    PrefixSum = ib.buffer_ptr(PrefixSum)
+    prefix_sum = ib.buffer_ptr(prefix_sum)
     out = ib.buffer_ptr(out)
 
     max_threads = int(tvm.target.Target.current(
@@ -226,9 +226,9 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         base_idx = i * num_anchors * elem_length
-        with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)):
+        with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, prefix_sum[tid] < num_anchors)):
             with ib.for_range(0, elem_length) as k:
-                out[base_idx + PrefixSum[tid] * elem_length +
+                out[base_idx + prefix_sum[tid] * elem_length +
                     k] = data[tid * elem_length + k]
         with ib.if_scope(j >= valid_count[i]):
             with ib.for_range(0, elem_length) as k:

From fd50a62fa6d03fc03439a8b10d4fc981fedef3f9 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Wed, 19 Feb 2020 19:20:37 +0000
Subject: [PATCH 09/10] fix lint

---
 topi/python/topi/cuda/nms.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index e4880005dabb..354cda916ec7 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -226,7 +226,8 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out):
         i = idxd(tid, num_anchors)
         j = idxm(tid, num_anchors)
         base_idx = i * num_anchors * elem_length
-        with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, prefix_sum[tid] < num_anchors)):
+        with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0,
+                                 prefix_sum[tid] < num_anchors)):
             with ib.for_range(0, elem_length) as k:
                 out[base_idx + prefix_sum[tid] * elem_length +
                     k] = data[tid * elem_length + k]

From 02d2eadd0035e8ae0f6ec345a8e651ff8fdccd0c Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Thu, 20 Feb 2020 20:15:44 +0000
Subject: [PATCH 10/10] correct atomicAdd identifier name

---
 topi/python/topi/cuda/nms.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 354cda916ec7..5485859de01f 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -29,11 +29,11 @@
 
 def cuda_atomic_add_rule(op):
     if op.dtype == "float32":
-        return tvm.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1])
+        return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
     if op.dtype == "float64":
-        return tvm.call_pure_extern("float64", "atomic_add", op.args[0], op.args[1])
+        return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
     if op.dtype == "int32":
-        return tvm.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
+        return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
     raise RuntimeError("only support int32, float32 and float64")