From ed42e85e9a902ff721cc84a3802f3e4da73cc981 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 12 Feb 2020 21:57:48 +0000 Subject: [PATCH 01/10] get_valid_count accuracy issue fixed for individual tests but not for all tests running together --- tests/python/relay/test_op_level5.py | 18 +- topi/python/topi/cuda/nms.py | 434 +++++++++----------------- topi/tests/python/test_topi_vision.py | 30 +- 3 files changed, 177 insertions(+), 305 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 03e700b3df83..1436d46f4680 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -221,17 +221,15 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): func = relay.Function([x], z.astuple()) func = run_infer_type(func) for target, ctx in ctx_list(): - if target == 'cuda': - return intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) verify_get_valid_counts((1, 2500, 6), 0, 0, 1) - verify_get_valid_counts((1, 2500, 5), -1, -1, 0) - verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) - verify_get_valid_counts((16, 500, 5), 0.95, -1, 0) + #verify_get_valid_counts((1, 2500, 5), -1, -1, 0) + #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) + #verify_get_valid_counts((16, 500, 5), 0.95, -1, 0) def test_non_max_suppression(): @@ -673,11 +671,11 @@ def verify_space_to_depth(dshape, block_size, layout): if __name__ == "__main__": - test_resize_infer_type() - test_resize() - test_crop_and_resize() - test_multibox_prior() - test_multibox_transform_loc() + #test_resize_infer_type() + #test_resize() + #test_crop_and_resize() + #test_multibox_prior() + #test_multibox_transform_loc() test_get_valid_counts() test_roi_align() test_roi_pool() diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 38f87a9523c8..da22dd0b0289 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -28,15 +28,34 @@ from .. import tag -def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index): - """Low level IR to Prepare get valid count of bounding boxes +def cuda_atomicAdd_rule(op): + if op.dtype == "float32": + return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) + elif op.dtype == "float64": + return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) + elif op.dtype == "int32": + return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) + else: + raise RuntimeError("only support int32, float32 and float64") + + +tvm.register_intrin_rule( + "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True) + + +def atomicAdd(x, y): + return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y) + + +def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, score_index): + """Low level IR to get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. Parameters ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. + data : Buffer + Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. @@ -44,14 +63,11 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index idx : Buffer 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - score_threshold : float32 - Lower limit of score for valid bounding boxes. - - id_index : optional, int - index of the class categories, -1 to disable. + valid_count : Buffer + 1-D buffer for valid number of boxes. - score_index: optional, int - Index of the scores/confidence of boxes. + out : Buffer + Rearranged data buffer. Returns ------- @@ -60,14 +76,20 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index """ batch_size = data.shape[0] num_anchors = data.shape[1] - box_data_length = data.shape[2] + elem_length = data.shape[2] ib = tvm.ir_builder.create() data = ib.buffer_ptr(data) - flag = ib.buffer_ptr(flag) - idx = ib.buffer_ptr(idx) - score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold) + + valid_count = ib.buffer_ptr(valid_count) + Flag = ib.buffer_ptr(Flag) + out = ib.buffer_ptr(out) + atomicAdd_return = ib.allocate( + valid_count.dtype, (1,), name='atomicAdd_return', scope='local') + one_count = tvm.const(1, dtype=valid_count.dtype) + score_threshold = tvm.make.node( + "FloatImm", dtype="float32", value=score_threshold) id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) @@ -79,163 +101,36 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx + idxd = tvm.indexdiv with ib.if_scope(tid < batch_size * num_anchors): - with ib.if_scope(tvm.all(data[tid * box_data_length + score_index] > score_threshold, \ - tvm.any(id_index < 0, data[tid * box_data_length + id_index] >= 0))): - flag[tid] = 1 - idx[tid] = 1 - with ib.else_scope(): - flag[tid] = 0 - idx[tid] = 0 - - return ib.get() - -def get_valid_counts_upsweep(data, idx_in, idx, partial): - """Low level IR of first step of scan: unsweep. - - Parameters - ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. + i = idxd(tid, num_anchors) + Flag[tid] = 0 + base_idx = i * num_anchors * elem_length + with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold, + tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): + Flag[tid] = 1 + with ib.for_range(0, elem_length) as k: + out[base_idx + k] = data[base_idx + k] + atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", + valid_count[i]), one_count) - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.ir_builder.create() - data = ib.buffer_ptr(data) - idx_in = ib.buffer_ptr(idx_in) - idx = ib.buffer_ptr(idx) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - nthread_tx = max_threads - nthread_bx = batch_size - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - new_range = num_anchors // elem_per_thread + 1 - # Scan: Upsweep: - with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)): - with ib.for_range(0, elem_per_thread) as i: - with ib.if_scope(bx * num_anchors + \ - tx * elem_per_thread + i < batch_size * num_anchors): - with ib.if_scope(i == 0): - partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread] - idx[bx * num_anchors + tx * elem_per_thread] = \ - idx_in[bx * num_anchors + tx * elem_per_thread] - with ib.else_scope(): - partial[bx * new_range + tx] += \ - idx_in[bx * num_anchors + tx * elem_per_thread + i] - idx[bx * num_anchors + tx * elem_per_thread + i] = \ - idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \ - idx_in[bx * num_anchors + tx * elem_per_thread + i] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) return ib.get() -def get_valid_counts_scan(data, partial_in, partial): - """Low level IR to do scan. - - Parameters - ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. +def flag_scan(Flag, PrefixSum): + batch_size = Flag.shape[0] + num_anchors = Flag.shape[1] - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] ib = tvm.ir_builder.create() - partial_in = ib.buffer_ptr(partial_in) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - nthread_tx = max_threads - nthread_bx = batch_size - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - var = tvm.make.node("FloatImm", dtype="float32", value=2) - new_range = num_anchors // elem_per_thread + 1 - iteration = cast(log(cast(new_range, "float32")) / math.log(2), "int32") - # Scan: Kogge-Stone adder - with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))): - with ib.for_range(0, iteration) as k: - with ib.if_scope(k == 0): - with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))): - partial[bx * new_range + tx] = \ - partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1] - with ib.else_scope(): - partial[bx * new_range] = partial_in[bx * new_range] - with ib.else_scope(): - with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \ - tx < tvm.min(new_range, num_anchors))): - partial[bx * new_range + tx] += \ - partial[bx * new_range + tx - cast(power(var, k), "int32")] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) - return ib.get() -def get_valid_counts_downsweep(data, idx_in, partial, idx): - """Low level IR to do downsweep of scan. + Flag = ib.buffer_ptr(Flag) + PrefixSum = ib.buffer_ptr(PrefixSum) + atomicAdd_return = ib.allocate( + "int32", (1,), name='atomicAdd_return', scope='local') - Parameters - ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. - - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.ir_builder.create() - idx_in = ib.buffer_ptr(idx_in) - idx = ib.buffer_ptr(idx) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 + max_threads = int(tvm.target.current_target( + allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") @@ -243,69 +138,44 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - new_range = num_anchors // elem_per_thread + 1 idxd = tvm.indexdiv idxm = tvm.indexmod - # Scan: Downsweep: - with ib. if_scope(tid < batch_size * num_anchors): - i = idxd(tid, num_anchors) # number of batches - j = idxm(tid, num_anchors) # number of anchors - with ib.if_scope(j < elem_per_thread): - idx[tid] = idx_in[tid] - with ib.else_scope(): - idx[tid] = idx_in[tid] + partial[i * new_range + idxd(j, elem_per_thread) - 1] - - return ib.get() - -def get_valid_counts_ir(data, flag, idx, valid_count, out): - """Low level IR to get valid count of bounding boxes - given a score threshold. Also moves valid boxes to the - top of input data. - - Parameters - ---------- - data : Buffer - Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. - flag : Buffer - 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. + with ib.if_scope(tid < batch_size * num_anchors): + i = idxd(tid, num_anchors) + j = idxm(tid, num_anchors) + with ib.for_range(0, j) as r: + # TODO: no difference for the following two + PrefixSum[tid] += Flag[i * num_anchors + r] + # atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", + # PrefixSum[tid]), Flag[i * num_anchors + r]) - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + return ib.get() - valid_count : Buffer - 1-D buffer for valid number of boxes. - out : Buffer - Rearranged data buffer. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - elem_length = data.shape[2] - size = batch_size * num_anchors * elem_length +def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out): + batch_size = out.shape[0] + num_anchors = out.shape[1] + elem_length = out.shape[2] ib = tvm.ir_builder.create() + one = tvm.const(1, dtype=out.dtype) data = ib.buffer_ptr(data) - flag = ib.buffer_ptr(flag) - idx = ib.buffer_ptr(idx) + Flag = ib.buffer_ptr(Flag) valid_count = ib.buffer_ptr(valid_count) + PrefixSum = ib.buffer_ptr(PrefixSum) + out_in = ib.buffer_ptr(out_in) out = ib.buffer_ptr(out) max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads - nthread_bx = batch_size * num_anchors * elem_length // max_threads + 1 + nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv idxm = tvm.indexmod @@ -313,17 +183,15 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - with ib.if_scope(flag[tid] > 0): + out[tid] = out_in[tid] + with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)): + with ib.for_range(0, elem_length) as k: + out[base_idx + PrefixSum[tid] * elem_length + + k] = data[tid * elem_length + k] + with ib.if_scope(j >= valid_count[i]): with ib.for_range(0, elem_length) as k: - with ib.if_scope(base_idx + (idx[tid] - 1) * elem_length + k < size): - out[base_idx + (idx[tid] - 1) * elem_length + k] =\ - data[base_idx + j * elem_length + k] - with ib.if_scope(j == 0): - valid_count[i] = idx[tid + num_anchors - 1] - with ib.if_scope(j >= idx[i * num_anchors + num_anchors - 1]): - with ib.for_range(0, elem_length) as l: - with ib.if_scope(tid * elem_length + l < size): - out[tid * elem_length + l] = -1.0 + out[tid * elem_length + k] = -one + return ib.get() @@ -356,56 +224,46 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): """ batch_size = data.shape[0] num_anchors = data.shape[1] - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - new_range = num_anchors // elem_per_thread + 1 + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) + valid_count_buf = api.decl_buffer( + (batch_size,), "int32", "valid_count_buf", data_alignment=8) + temp_out_buf = api.decl_buffer( + data.shape, data.dtype, "temp_out_buf", data_alignment=8) temp_flag_buf = api.decl_buffer( (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8) - temp_idx_buf = api.decl_buffer( - (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8) temp_partial_buf = api.decl_buffer( - (batch_size, new_range), "int32", "temp_partial", data_alignment=8) - data_buf = api.decl_buffer( - data.shape, data.dtype, "data_buf", data_alignment=8) - - temp_flag, temp_idx = \ - tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data], - lambda ins, outs: get_valid_counts_pre( - ins[0], outs[0], outs[1], score_threshold, id_index, score_index), - dtype=["int32", "int32"], - out_buffers=[temp_flag_buf, temp_idx_buf], - name="get_valid_counts_phase_one") - temp_idx_new, temp_partial = \ - tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx], - lambda ins, outs: get_valid_counts_upsweep( - ins[0], ins[1], outs[0], outs[1]), - dtype=["int32", "int32"], - out_buffers=[temp_idx_buf, temp_partial_buf], - name="get_valid_counts_phase_two") - temp_partial_new = \ - tvm.extern([(batch_size, new_range)], [data, temp_partial], - lambda ins, outs: get_valid_counts_scan( - ins[0], ins[1], outs[0]), - dtype=["int32"], - out_buffers=[temp_partial_buf], - name="get_valid_counts_phase_three") - temp_idx_final = \ - tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new], - lambda ins, outs: get_valid_counts_downsweep( - ins[0], ins[1], ins[2], outs[0]), - dtype=["int32"], - out_buffers=[temp_idx_buf], - name="get_valid_counts_phase_four") - valid_count, out_tensor = \ - tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final], - lambda ins, outs: get_valid_counts_ir( - ins[0], ins[1], ins[2], outs[0], outs[1]), - dtype=["int32", data.dtype], - in_buffers=[data_buf, temp_flag_buf, temp_idx_buf], - name="get_valid_counts_phase_five", + (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8) + + valid_count, temp_flag, out_in = \ + tvm.extern([(batch_size,), (batch_size, num_anchors), data.shape], [data], + lambda ins, outs: get_valid_counts_ir( + ins[0], outs[0], outs[1], outs[2], score_threshold, id_index, score_index), + dtype=["int32", "int32", data.dtype], + in_buffers=[data_buf], + out_buffers=[valid_count_buf, temp_flag_buf, temp_out_buf], + name="get_valid_counts", tag="get_valid_counts_gpu") - return [valid_count, out_tensor] + temp_partial = \ + tvm.extern([(batch_size, num_anchors)], [temp_flag], + lambda ins, outs: flag_scan( + ins[0], outs[0]), + dtype=["int32"], + in_buffers=[temp_flag_buf], + out_buffers=[temp_partial_buf], + name="flag_scan") + + out = \ + tvm.extern([data.shape], [data, temp_flag, temp_partial, out_in, valid_count], + lambda ins, outs: out_rewrite( + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0]), + dtype=[data.dtype], + in_buffers=[data_buf, temp_flag_buf, + temp_partial_buf, temp_out_buf, valid_count_buf], + name="out_rewrite") + + return [valid_count, out] def nms_ir(data, sorted_index, valid_count, out, box_indices, @@ -479,7 +337,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) - num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") + num_valid_boxes = ib.allocate( + "int32", (1,), name="num_valid_boxes", scope="local") max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) @@ -491,26 +350,29 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx - iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold) + iou_threshold = tvm.make.node( + "FloatImm", dtype="float32", value=iou_threshold) top_k = tvm.make.node("IntImm", dtype="int32", value=top_k) coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start) id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) - force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + force_suppress = tvm.make.node( + "IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll") as i: base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output - nkeep = if_then_else( \ - tvm.all(top_k > 0, top_k < valid_count[i]), - top_k, valid_count[i]) + nkeep = if_then_else( + tvm.all(top_k > 0, top_k < valid_count[i]), + top_k, valid_count[i]) with ib.if_scope(j < nkeep): with ib.for_range(0, box_data_length) as k: out[(base_idx + j * box_data_length + k)] = \ - data[(base_idx + sorted_index[i * num_anchors + j] \ - * box_data_length + k)] - box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] + data[(base_idx + sorted_index[i * num_anchors + j] + * box_data_length + k)] + box_indices[i * num_anchors + + j] = sorted_index[i * num_anchors + j] with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])): with ib.if_scope(j < valid_count[i] - nkeep): with ib.for_range(0, box_data_length) as k: @@ -519,16 +381,17 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): # Apply nms with ib.for_range(0, valid_count[i]) as k: offset_k = k * box_data_length - with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, \ - tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))): + with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, + tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length - with ib.if_scope(tvm.all(j > k, \ - out[base_idx + offset_j + score_index] > 0, \ - tvm.any(id_index < 0, \ - out[base_idx + offset_j + id_index] >= 0), \ - tvm.any(force_suppress > 0, id_index < 0, \ - out[base_idx + offset_k + id_index] == \ + with ib.if_scope(tvm.all(j > k, + out[base_idx + offset_j + + score_index] > 0, + tvm.any(id_index < 0, + out[base_idx + offset_j + id_index] >= 0), + tvm.any(force_suppress > 0, id_index < 0, + out[base_idx + offset_k + id_index] == out[base_idx + offset_j + id_index]))): iou = calculate_overlap(out, base_idx + offset_j + coord_start, base_idx + offset_k + coord_start) @@ -541,12 +404,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length with ib.for_range(0, box_data_length) as k: - out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k] + out[(base_idx + offset_j + k) + ] = data[base_idx + offset_j + k] box_indices[i * num_anchors + j] = j # Set invalid entry to be -1 with ib.if_scope(j < num_anchors - valid_count[i]): with ib.for_range(0, box_data_length) as k: - out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0 + out[base_idx + (j + valid_count[i]) * + box_data_length + k] = -1.0 box_indices[i * num_anchors + j + valid_count[i]] = -1 # Only return max_output_size number of valid boxes num_valid_boxes[0] = 0 @@ -671,7 +536,7 @@ def invalid_to_bottom_ir(data, flag, idx, out): with ib.if_scope(flag[i * num_anchors + j] > 0): with ib.for_range(0, elem_length) as k: out[base_idx + (idx[i * num_anchors + j] - 1) * elem_length + k] \ - = data[base_idx + j * elem_length + k] + = data[base_idx + j * elem_length + k] return ib.get() @@ -756,8 +621,10 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1, "valid_count_buf", data_alignment=4) score_axis = score_index score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) - sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) + score_tensor = tvm.compute( + score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) + sort_tensor = argsort( + score_tensor, valid_count=valid_count, axis=1, is_ascend=False) sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8) @@ -795,7 +662,8 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1, ins[0], outs[0], outs[1]), dtype=["int32", "int32"], in_buffers=[out_buf], - out_buffers=[temp_flag_buf, temp_idx_buf], + out_buffers=[ + temp_flag_buf, temp_idx_buf], name="invalid_to_bottom_phase_one") output = tvm.extern([data.shape], [out, temp_flag, temp_idx], diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index a081f0797dad..4e3bce4aa94e 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -63,21 +63,27 @@ def check_device(device): tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) f = tvm.build(s, [data, outs[0], outs[1]], device) f(tvm_input_data, tvm_out1, tvm_out2) + import sys + np.set_printoptions(threshold=sys.maxsize) + #print(tvm_out2.asnumpy()) + #print("====================================") + #print(np_out2) + #print("===============diff================") + #print(tvm_out2.asnumpy() - np_out2) tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - for device in ['llvm', 'cuda', 'opencl']: + for device in ['cuda', 'opencl']: # Disable gpu test for now - if device != "llvm": - continue check_device(device) def test_get_valid_counts(): - verify_get_valid_counts((1, 2500, 6), 0, 0, 1) - verify_get_valid_counts((1, 2500, 5), -1, -1, 0) - verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) - verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) + verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1) + #verify_get_valid_counts((1, 2500, 6), 0, 0, 1) + #verify_get_valid_counts((1, 2500, 5), -1, -1, 0) + #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) + #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, iou_threshold, @@ -424,8 +430,8 @@ def test_proposal(): if __name__ == "__main__": test_get_valid_counts() - test_non_max_suppression() - test_multibox_prior() - test_multibox_detection() - test_roi_align() - test_proposal() + #test_non_max_suppression() + #test_multibox_prior() + #test_multibox_detection() + #test_roi_align() + #test_proposal() From ffac02f8d20ecb54e054b9c52edc0ff89d02eb03 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 14 Feb 2020 21:52:46 +0000 Subject: [PATCH 02/10] minor fix --- topi/python/topi/cuda/nms.py | 48 ++++++++++++--------------- topi/tests/python/test_topi_vision.py | 2 +- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index da22dd0b0289..6ba97e87da0c 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -39,7 +39,7 @@ def cuda_atomicAdd_rule(op): raise RuntimeError("only support int32, float32 and float64") -tvm.register_intrin_rule( +tvm.target.intrin.register_intrin_rule( "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True) @@ -47,7 +47,7 @@ def atomicAdd(x, y): return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y) -def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, score_index): +def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -84,7 +84,6 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, valid_count = ib.buffer_ptr(valid_count) Flag = ib.buffer_ptr(Flag) - out = ib.buffer_ptr(out) atomicAdd_return = ib.allocate( valid_count.dtype, (1,), name='atomicAdd_return', scope='local') one_count = tvm.const(1, dtype=valid_count.dtype) @@ -93,7 +92,8 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") @@ -106,14 +106,13 @@ def get_valid_counts_ir(data, valid_count, Flag, out, score_threshold, id_index, with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) Flag[tid] = 0 - base_idx = i * num_anchors * elem_length with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold, tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): Flag[tid] = 1 - with ib.for_range(0, elem_length) as k: - out[base_idx + k] = data[base_idx + k] atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", valid_count[i]), one_count) + with ib.else_scope(): + Flag[tid] = 0 return ib.get() @@ -126,10 +125,8 @@ def flag_scan(Flag, PrefixSum): Flag = ib.buffer_ptr(Flag) PrefixSum = ib.buffer_ptr(PrefixSum) - atomicAdd_return = ib.allocate( - "int32", (1,), name='atomicAdd_return', scope='local') - max_threads = int(tvm.target.current_target( + max_threads = int(tvm.target.Target.current( allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 @@ -145,15 +142,12 @@ def flag_scan(Flag, PrefixSum): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) with ib.for_range(0, j) as r: - # TODO: no difference for the following two PrefixSum[tid] += Flag[i * num_anchors + r] - # atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", - # PrefixSum[tid]), Flag[i * num_anchors + r]) return ib.get() -def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out): +def out_rewrite(data, Flag, PrefixSum, valid_count, out): batch_size = out.shape[0] num_anchors = out.shape[1] elem_length = out.shape[2] @@ -165,10 +159,10 @@ def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out): Flag = ib.buffer_ptr(Flag) valid_count = ib.buffer_ptr(valid_count) PrefixSum = ib.buffer_ptr(PrefixSum) - out_in = ib.buffer_ptr(out_in) out = ib.buffer_ptr(out) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") @@ -183,7 +177,6 @@ def out_rewrite(data, Flag, PrefixSum, out_in, valid_count, out): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - out[tid] = out_in[tid] with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)): with ib.for_range(0, elem_length) as k: out[base_idx + PrefixSum[tid] * elem_length + @@ -228,20 +221,20 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): data.shape, data.dtype, "data_buf", data_alignment=8) valid_count_buf = api.decl_buffer( (batch_size,), "int32", "valid_count_buf", data_alignment=8) - temp_out_buf = api.decl_buffer( - data.shape, data.dtype, "temp_out_buf", data_alignment=8) temp_flag_buf = api.decl_buffer( (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8) temp_partial_buf = api.decl_buffer( (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8) + out_buf = api.decl_buffer( + data.shape, data.dtype, "out_buf", data_alignment=8) - valid_count, temp_flag, out_in = \ - tvm.extern([(batch_size,), (batch_size, num_anchors), data.shape], [data], + valid_count, temp_flag = \ + tvm.extern([(batch_size,), (batch_size, num_anchors)], [data], lambda ins, outs: get_valid_counts_ir( - ins[0], outs[0], outs[1], outs[2], score_threshold, id_index, score_index), - dtype=["int32", "int32", data.dtype], + ins[0], outs[0], outs[1], score_threshold, id_index, score_index), + dtype=["int32", "int32"], in_buffers=[data_buf], - out_buffers=[valid_count_buf, temp_flag_buf, temp_out_buf], + out_buffers=[valid_count_buf, temp_flag_buf], name="get_valid_counts", tag="get_valid_counts_gpu") @@ -255,12 +248,13 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): name="flag_scan") out = \ - tvm.extern([data.shape], [data, temp_flag, temp_partial, out_in, valid_count], + tvm.extern([data.shape], [data, temp_flag, temp_partial, valid_count], lambda ins, outs: out_rewrite( - ins[0], ins[1], ins[2], ins[3], ins[4], outs[0]), + ins[0], ins[1], ins[2], ins[3], outs[0]), dtype=[data.dtype], in_buffers=[data_buf, temp_flag_buf, - temp_partial_buf, temp_out_buf, valid_count_buf], + temp_partial_buf, valid_count_buf], + out_buffers=[out_buf], name="out_rewrite") return [valid_count, out] diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 4e3bce4aa94e..0207de801648 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -80,7 +80,7 @@ def check_device(device): def test_get_valid_counts(): verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1) - #verify_get_valid_counts((1, 2500, 6), 0, 0, 1) + verify_get_valid_counts((1, 125000, 6), 0, 0, 1) #verify_get_valid_counts((1, 2500, 5), -1, -1, 0) #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) From e1dfe59b9d7a5998ccd9a1c20f0bbd735b17f4d9 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 17 Feb 2020 22:46:21 +0000 Subject: [PATCH 03/10] initialize valid_count and PrefixSum buffers --- topi/python/topi/cuda/nms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 6ba97e87da0c..46ad335e04cc 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -103,6 +103,8 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor tid = bx * max_threads + tx idxd = tvm.indexdiv + with ib.if_scope(tid < batch_size): + valid_count[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) Flag[tid] = 0 @@ -111,8 +113,6 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor Flag[tid] = 1 atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", valid_count[i]), one_count) - with ib.else_scope(): - Flag[tid] = 0 return ib.get() @@ -138,6 +138,8 @@ def flag_scan(Flag, PrefixSum): idxd = tvm.indexdiv idxm = tvm.indexmod + with ib.if_scope(tid < batch_size * num_anchors): + PrefixSum[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) From e1e8d6f946daec399f578fbf7d96c7abeac0cefc Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 17 Feb 2020 23:25:11 +0000 Subject: [PATCH 04/10] test updated --- topi/python/topi/cuda/nms.py | 6 ++++- topi/tests/python/test_topi_vision.py | 32 +++++++++++---------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 46ad335e04cc..67a7e9458b60 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -103,11 +103,14 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor tid = bx * max_threads + tx idxd = tvm.indexdiv + # initialize valid_count with ib.if_scope(tid < batch_size): valid_count[tid] = 0 + # initialize Flag with ib.if_scope(tid < batch_size * num_anchors): - i = idxd(tid, num_anchors) Flag[tid] = 0 + with ib.if_scope(tid < batch_size * num_anchors): + i = idxd(tid, num_anchors) with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold, tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): Flag[tid] = 1 @@ -138,6 +141,7 @@ def flag_scan(Flag, PrefixSum): idxd = tvm.indexdiv idxm = tvm.indexmod + # initialize PrefixSum with ib.if_scope(tid < batch_size * num_anchors): PrefixSum[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 0207de801648..85e4180a0892 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -63,27 +63,21 @@ def check_device(device): tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) f = tvm.build(s, [data, outs[0], outs[1]], device) f(tvm_input_data, tvm_out1, tvm_out2) - import sys - np.set_printoptions(threshold=sys.maxsize) - #print(tvm_out2.asnumpy()) - #print("====================================") - #print(np_out2) - #print("===============diff================") - #print(tvm_out2.asnumpy() - np_out2) tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - for device in ['cuda', 'opencl']: - # Disable gpu test for now + for device in ['llvm', 'cuda', 'opencl']: + # Disable opencl test for now + if device != "llvm" and device != "cuda": + continue check_device(device) def test_get_valid_counts(): - verify_get_valid_counts((1, 122640, 6), 0.01, 0, 1) - verify_get_valid_counts((1, 125000, 6), 0, 0, 1) - #verify_get_valid_counts((1, 2500, 5), -1, -1, 0) - #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) - #verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) + verify_get_valid_counts((1, 2500, 6), 0, 0, 1) + verify_get_valid_counts((1, 2500, 5), -1, -1, 0) + verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) + verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, iou_threshold, @@ -430,8 +424,8 @@ def test_proposal(): if __name__ == "__main__": test_get_valid_counts() - #test_non_max_suppression() - #test_multibox_prior() - #test_multibox_detection() - #test_roi_align() - #test_proposal() + test_non_max_suppression() + test_multibox_prior() + test_multibox_detection() + test_roi_align() + test_proposal() From a0795d3e010762ec6f670e93eee868cc3a9b0907 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 17 Feb 2020 23:30:52 +0000 Subject: [PATCH 05/10] udpate relay test as well --- tests/python/relay/test_op_level5.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 1436d46f4680..e622a8ae01ab 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -227,9 +227,9 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) verify_get_valid_counts((1, 2500, 6), 0, 0, 1) - #verify_get_valid_counts((1, 2500, 5), -1, -1, 0) - #verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) - #verify_get_valid_counts((16, 500, 5), 0.95, -1, 0) + verify_get_valid_counts((1, 2500, 5), -1, -1, 0) + verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) + verify_get_valid_counts((16, 500, 5), 0.95, -1, 0) def test_non_max_suppression(): @@ -671,11 +671,11 @@ def verify_space_to_depth(dshape, block_size, layout): if __name__ == "__main__": - #test_resize_infer_type() - #test_resize() - #test_crop_and_resize() - #test_multibox_prior() - #test_multibox_transform_loc() + test_resize_infer_type() + test_resize() + test_crop_and_resize() + test_multibox_prior() + test_multibox_transform_loc() test_get_valid_counts() test_roi_align() test_roi_pool() From 48738bd79db42f4e392dbd229fcd7c79ecce2e45 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 17 Feb 2020 23:38:39 +0000 Subject: [PATCH 06/10] update document --- topi/python/topi/cuda/nms.py | 59 +++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 67a7e9458b60..0ab3cfd7b76f 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -49,7 +49,7 @@ def atomicAdd(x, y): def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes - given a score threshold. Also moves valid boxes to the + given a score threshold. Also prepares to move valid boxes to the top of input data. Parameters @@ -57,17 +57,20 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor data : Buffer Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. - flag : Buffer + valid_count : Buffer + 1D buffer for valid number of boxes with shape [batch_size, ]. + + Flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + score_threshold : float32 + Lower limit of score for valid bounding boxes. - valid_count : Buffer - 1-D buffer for valid number of boxes. + id_index : optional, int + index of the class categories, -1 to disable. - out : Buffer - Rearranged data buffer. + score_index: optional, int + Index of the scores/confidence of boxes. Returns ------- @@ -121,6 +124,21 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor def flag_scan(Flag, PrefixSum): + """Low level IR to calculate correct positions for valid boxes. + + Parameters + ---------- + Flag : Buffer + 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. + + PrefixSum : Buffer + 2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ batch_size = Flag.shape[0] num_anchors = Flag.shape[1] @@ -154,6 +172,31 @@ def flag_scan(Flag, PrefixSum): def out_rewrite(data, Flag, PrefixSum, valid_count, out): + """Low level IR to move valid boxes to the + top of input data. + + Parameters + ---------- + data : Buffer + Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. + + Flag : Buffer + 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. + + PrefixSum : Buffer + 2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag. + + valid_count : Buffer + 1D buffer for valid number of boxes with shape [batch_size, ]. + + out : Buffer + Rearranged data buffer. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ batch_size = out.shape[0] num_anchors = out.shape[1] elem_length = out.shape[2] From 94e87f5ed88c63bcffecab9fded365244b0925c2 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 18 Feb 2020 00:46:51 +0000 Subject: [PATCH 07/10] fix lint --- topi/python/topi/cuda/nms.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 0ab3cfd7b76f..bc5227165eac 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -21,8 +21,7 @@ import tvm from tvm import api -from tvm.generic import cast -from tvm.intrin import if_then_else, log, power +from tvm.intrin import if_then_else from topi.vision import non_max_suppression, get_valid_counts from .sort import argsort from .. import tag @@ -31,12 +30,11 @@ def cuda_atomicAdd_rule(op): if op.dtype == "float32": return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) - elif op.dtype == "float64": + if op.dtype == "float64": return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) - elif op.dtype == "int32": + if op.dtype == "int32": return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) - else: - raise RuntimeError("only support int32, float32 and float64") + raise RuntimeError("only support int32, float32 and float64") tvm.target.intrin.register_intrin_rule( @@ -67,7 +65,7 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor Lower limit of score for valid bounding boxes. id_index : optional, int - index of the class categories, -1 to disable. + index of the class categories, -1 to disable. score_index: optional, int Index of the scores/confidence of boxes. @@ -132,7 +130,8 @@ def flag_scan(Flag, PrefixSum): 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. PrefixSum : Buffer - 2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag. + 2D Buffer of prefix sum of flags indicating new locations of valid boxes + with same shape as Flag. Returns ------- @@ -184,7 +183,8 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out): 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. PrefixSum : Buffer - 2D Buffer of prefix sum of flags indicating new locations of valid boxes with same shape as Flag. + 2D Buffer of prefix sum of flags indicating new locations of valid boxes + with same shape as Flag. valid_count : Buffer 1D buffer for valid number of boxes with shape [batch_size, ]. @@ -425,7 +425,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.for_range(0, valid_count[i]) as k: offset_k = k * box_data_length with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, - tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))): + tvm.any(id_index < 0, out[base_idx + + offset_k + id_index] >= 0))): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length with ib.if_scope(tvm.all(j > k, From 03d4cc797a409f43872b3d29a5a2a982d8046d7b Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 19 Feb 2020 02:47:42 +0000 Subject: [PATCH 08/10] address comment --- topi/python/topi/cuda/nms.py | 70 ++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index bc5227165eac..e4880005dabb 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -27,25 +27,25 @@ from .. import tag -def cuda_atomicAdd_rule(op): +def cuda_atomic_add_rule(op): if op.dtype == "float32": - return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) + return tvm.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1]) if op.dtype == "float64": - return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) + return tvm.call_pure_extern("float64", "atomic_add", op.args[0], op.args[1]) if op.dtype == "int32": - return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) + return tvm.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) raise RuntimeError("only support int32, float32 and float64") tvm.target.intrin.register_intrin_rule( - "cuda", "atomicAdd", cuda_atomicAdd_rule, override=True) + "cuda", "atomic_add", cuda_atomic_add_rule, override=True) -def atomicAdd(x, y): - return tvm.call_pure_intrin(y.dtype, "atomicAdd", x, y) +def atomic_add(x, y): + return tvm.call_pure_intrin(y.dtype, "atomic_add", x, y) -def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, score_index): +def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes given a score threshold. Also prepares to move valid boxes to the top of input data. @@ -58,7 +58,7 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor valid_count : Buffer 1D buffer for valid number of boxes with shape [batch_size, ]. - Flag : Buffer + flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. score_threshold : float32 @@ -84,9 +84,9 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor data = ib.buffer_ptr(data) valid_count = ib.buffer_ptr(valid_count) - Flag = ib.buffer_ptr(Flag) - atomicAdd_return = ib.allocate( - valid_count.dtype, (1,), name='atomicAdd_return', scope='local') + flag = ib.buffer_ptr(flag) + atomic_add_return = ib.allocate( + valid_count.dtype, (1,), name='atomic_add_return', scope='local') one_count = tvm.const(1, dtype=valid_count.dtype) score_threshold = tvm.make.node( "FloatImm", dtype="float32", value=score_threshold) @@ -107,44 +107,44 @@ def get_valid_counts_ir(data, valid_count, Flag, score_threshold, id_index, scor # initialize valid_count with ib.if_scope(tid < batch_size): valid_count[tid] = 0 - # initialize Flag + # initialize flag with ib.if_scope(tid < batch_size * num_anchors): - Flag[tid] = 0 + flag[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold, tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): - Flag[tid] = 1 - atomicAdd_return[0] = atomicAdd(tvm.call_pure_intrin("handle", "tvm_address_of", + flag[tid] = 1 + atomic_add_return[0] = atomic_add(tvm.call_pure_intrin("handle", "tvm_address_of", valid_count[i]), one_count) return ib.get() -def flag_scan(Flag, PrefixSum): +def flag_scan(flag, prefix_sum): """Low level IR to calculate correct positions for valid boxes. Parameters ---------- - Flag : Buffer + flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - PrefixSum : Buffer + prefix_sum : Buffer 2D Buffer of prefix sum of flags indicating new locations of valid boxes - with same shape as Flag. + with same shape as flag. Returns ------- stmt : Stmt The result IR statement. """ - batch_size = Flag.shape[0] - num_anchors = Flag.shape[1] + batch_size = flag.shape[0] + num_anchors = flag.shape[1] ib = tvm.ir_builder.create() - Flag = ib.buffer_ptr(Flag) - PrefixSum = ib.buffer_ptr(PrefixSum) + flag = ib.buffer_ptr(flag) + prefix_sum = ib.buffer_ptr(prefix_sum) max_threads = int(tvm.target.Target.current( allow_none=False).max_num_threads) @@ -158,19 +158,19 @@ def flag_scan(Flag, PrefixSum): idxd = tvm.indexdiv idxm = tvm.indexmod - # initialize PrefixSum + # initialize prefix_sum with ib.if_scope(tid < batch_size * num_anchors): - PrefixSum[tid] = 0 + prefix_sum[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) with ib.for_range(0, j) as r: - PrefixSum[tid] += Flag[i * num_anchors + r] + prefix_sum[tid] += flag[i * num_anchors + r] return ib.get() -def out_rewrite(data, Flag, PrefixSum, valid_count, out): +def out_rewrite(data, flag, prefix_sum, valid_count, out): """Low level IR to move valid boxes to the top of input data. @@ -179,12 +179,12 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out): data : Buffer Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. - Flag : Buffer + flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - PrefixSum : Buffer + prefix_sum : Buffer 2D Buffer of prefix sum of flags indicating new locations of valid boxes - with same shape as Flag. + with same shape as flag. valid_count : Buffer 1D buffer for valid number of boxes with shape [batch_size, ]. @@ -205,9 +205,9 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out): one = tvm.const(1, dtype=out.dtype) data = ib.buffer_ptr(data) - Flag = ib.buffer_ptr(Flag) + flag = ib.buffer_ptr(flag) valid_count = ib.buffer_ptr(valid_count) - PrefixSum = ib.buffer_ptr(PrefixSum) + prefix_sum = ib.buffer_ptr(prefix_sum) out = ib.buffer_ptr(out) max_threads = int(tvm.target.Target.current( @@ -226,9 +226,9 @@ def out_rewrite(data, Flag, PrefixSum, valid_count, out): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - with ib.if_scope(tvm.all(Flag[tid] > 0, PrefixSum[tid] >= 0, PrefixSum[tid] < num_anchors)): + with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, prefix_sum[tid] < num_anchors)): with ib.for_range(0, elem_length) as k: - out[base_idx + PrefixSum[tid] * elem_length + + out[base_idx + prefix_sum[tid] * elem_length + k] = data[tid * elem_length + k] with ib.if_scope(j >= valid_count[i]): with ib.for_range(0, elem_length) as k: From fd50a62fa6d03fc03439a8b10d4fc981fedef3f9 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Wed, 19 Feb 2020 19:20:37 +0000 Subject: [PATCH 09/10] fix lint --- topi/python/topi/cuda/nms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index e4880005dabb..354cda916ec7 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -226,7 +226,8 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, prefix_sum[tid] < num_anchors)): + with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, + prefix_sum[tid] < num_anchors)): with ib.for_range(0, elem_length) as k: out[base_idx + prefix_sum[tid] * elem_length + k] = data[tid * elem_length + k] From 02d2eadd0035e8ae0f6ec345a8e651ff8fdccd0c Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Thu, 20 Feb 2020 20:15:44 +0000 Subject: [PATCH 10/10] correct atomicAdd identifier name --- topi/python/topi/cuda/nms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 354cda916ec7..5485859de01f 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -29,11 +29,11 @@ def cuda_atomic_add_rule(op): if op.dtype == "float32": - return tvm.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1]) + return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) if op.dtype == "float64": - return tvm.call_pure_extern("float64", "atomic_add", op.args[0], op.args[1]) + return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) if op.dtype == "int32": - return tvm.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1]) + return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) raise RuntimeError("only support int32, float32 and float64")