From 68fb2f646bbdafb7821fdbe845acf60ea88098ed Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Thu, 27 Aug 2020 21:39:01 +0000
Subject: [PATCH 01/13] argwhere

---
 python/tvm/relay/op/strategy/cuda.py          |  6 ++
 python/tvm/topi/cuda/__init__.py              |  1 +
 python/tvm/topi/cuda/argwhere.py              | 54 +++++++++++++
 tests/python/relay/test_any.py                |  9 +--
 .../python/topi/python/test_topi_argwhere.py  | 76 +++++++++++++++++++
 5 files changed, 138 insertions(+), 8 deletions(-)
 create mode 100644 python/tvm/topi/cuda/argwhere.py
 create mode 100644 tests/python/topi/python/test_topi_argwhere.py

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 029690680e7d..f7fde9a9ec44 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -921,3 +921,9 @@ def correlation_strategy_cuda(attrs, inputs, out_type, target):
         name="correlation.cuda",
     )
     return strategy
+
+@schedule_argwhere.register(["cuda", "gpu"])
+def schedule_argwhere_cuda(attrs, outs, target):
+    """schedule argwhere for cuda"""
+    with target:
+        return topi.cuda.schedule_argwhere(outs)
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index 3ff544f4bb3e..23c625ae7ff7 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -54,3 +54,4 @@
 from .conv2d_hwnc_tensorcore import *
 from .correlation import *
 from .sparse import *
+from .argwhere import *
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
new file mode 100644
index 000000000000..6505d8359cb7
--- /dev/null
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=too-many-arguments
+"""Argwhere operator"""
+
+import tvm
+from tvm import te
+from ..util import traverse_inline
+
+def schedule_argwhere(outs):
+    """Schedule for argwhere on cuda.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of argwhere
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for argwhere
+    """
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    sch = te.create_schedule([x.op for x in outs])
+
+    def _schedule_argwhere(op):
+        if op in sch.outputs:
+            out = op
+        else:
+            out = outs[0].op.output(0)
+        fused = sch[out].fuse(*sch[out].op.axis)
+        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
+        bx, tx = sch[out].split(fused, factor=num_thread)
+        sch[out].bind(bx, te.thread_axis("blockIdx.x"))
+        sch[out].bind(tx, te.thread_axis("threadIdx.x"))
+
+    traverse_inline(sch, outs[0].op, _schedule_argwhere)
+
+    return sch
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index ee67e67b282f..3cd41d9e1961 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -219,14 +219,7 @@ def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
     mod["main"] = relay.Function([x], y)
     data = np.random.choice([0, 1, 2, 3], size=x_np_shape).astype(dtype)
     expected = np.argwhere(data)
-    for kind in ["debug", "vm"]:
-        ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm")
-        result = ex.evaluate()(data).asnumpy()
-        assert result.shape == expected.shape
-        tvm.testing.assert_allclose(result.flatten(), expected.flatten())
-
-    # TODO(@zhiics) argwhere gpu schedule is currently not avaiable
-    # check_result([data], mod, expected, flatten=True)
+    check_result([data], mod, expected, flatten=True)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
new file mode 100644
index 000000000000..d8e189588421
--- /dev/null
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test for argwhere operator"""
+import numpy as np
+
+import tvm
+from tvm import te
+from tvm import topi
+import tvm.topi.testing
+
+_argwhere_schedule = {
+    "generic": topi.generic.schedule_argwhere,
+    "gpu": topi.cuda.schedule_argwhere,
+}
+
+def verify_argwhere(data_shape):
+    dtype = "int32"
+    np_data = np.random.randint(5, size=data_shape).astype(dtype)
+    np_out = np.argwhere(np_data > 0)
+    out_shape = np_out.shape[0]
+    np_shape = np.ones(shape=(out_shape, len(data_shape)), dtype=dtype)
+
+    out_shape = te.placeholder(shape=(out_shape, len(data_shape)),
+                               name="out_shape", dtype=dtype)
+    condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype)
+    out = topi.argwhere(out_shape, condition)
+
+    def check_device(device, ctx):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+
+        with tvm.target.create(device):
+            s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule)
+            sch = s_func(out)
+
+        func = tvm.build(sch, [out_shape, condition, out], device,
+                         name="argwhere")
+
+        args = [tvm.nd.array(np_shape, ctx)]
+        args.append(tvm.nd.array(np_data, ctx))
+        args.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=condition.dtype))
+        func(*args)
+        tvm.testing.assert_allclose(args[-1].asnumpy(), np.array(np_out))
+
+    for target, ctx in tvm.testing.enabled_targets():
+        check_device(target, ctx)
+
+
+@tvm.testing.uses_gpu
+def test_argwhere():
+    verify_argwhere((1,))
+    verify_argwhere((5,))
+    verify_argwhere((5, 3))
+    verify_argwhere((6, 5, 3))
+    verify_argwhere((6, 4, 5, 3))
+    verify_argwhere((6, 4, 5, 3, 7))
+
+
+if __name__ == "__main__":
+    test_argwhere()

From d91598b8d43c6173622cc14a1e4a8077a4734aae Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Thu, 10 Sep 2020 23:38:24 +0000
Subject: [PATCH 02/13] cuda schedule

---
 3rdparty/vta-hw                               |   2 +-
 python/tvm/relay/op/_transform.py             |  16 +-
 python/tvm/relay/op/strategy/cuda.py          |  14 +-
 python/tvm/relay/op/strategy/generic.py       |  33 +-
 python/tvm/topi/argwhere.py                   |   2 +
 python/tvm/topi/cuda/argwhere.py              | 516 +++++++++++++++++-
 .../python/topi/python/test_topi_argwhere.py  |  18 +-
 7 files changed, 550 insertions(+), 51 deletions(-)

diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 12fb486a491b..87ce9acfae55 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 12fb486a491b75d70ec4c5e0a0cd112ab49a95bc
+Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 38d27e3a6833..05ca6d2e4bb9 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -83,21 +83,7 @@ def compute_strided_set(attrs, inputs, output_type):
 _reg.register_pattern("auto_scheduler_layout_transform", OpPattern.INJECTIVE)
 
 # argwhere
-@_reg.register_compute("argwhere")
-def compute_argwhere(attrs, inputs, output_type):
-    """Compute definition of argwhere"""
-    output_shape = []
-    for s in output_type.shape:
-        if hasattr(s, "value"):
-            output_shape.append(s)
-        else:
-            # see Any, replace it with a var
-            output_shape.append(te.var("any_dim", "int32"))
-    new_output_type = tvm.relay.ty.TensorType(output_shape, "int32")
-    return [topi.argwhere(new_output_type, inputs[0])]
-
-
-_reg.register_schedule("argwhere", strategy.schedule_argwhere)
+_reg.register_strategy("argwhere", strategy.argwhere_strategy)
 
 # scatter
 @_reg.register_compute("scatter")
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index f7fde9a9ec44..3a935d3288cf 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -922,8 +922,12 @@ def correlation_strategy_cuda(attrs, inputs, out_type, target):
     )
     return strategy
 
-@schedule_argwhere.register(["cuda", "gpu"])
-def schedule_argwhere_cuda(attrs, outs, target):
-    """schedule argwhere for cuda"""
-    with target:
-        return topi.cuda.schedule_argwhere(outs)
+@argwhere_strategy.register(["cuda", "gpu"])
+def argwhere_strategy_cuda(attrs, inputs, out_type, target):
+    """argwhere cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_argwhere(topi.cuda.argwhere),
+        wrap_topi_schedule(topi.cuda.schedule_argwhere),
+        name="argwhere.cuda")
+    return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index a03c5177914b..f90ed6e1a220 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -19,7 +19,7 @@
 import logging
 
 import re
-from tvm import topi, _ffi
+from tvm import topi, _ffi, te, ir
 from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple
 from tvm.target import generic_func, override_native_generic_func
 from .. import op as _op
@@ -1034,14 +1034,6 @@ def proposal_strategy(attrs, inputs, out_type, target):
     return strategy
 
 
-# argwhere
-@generic_func
-def schedule_argwhere(attrs, outs, target):
-    """schedule argwhere"""
-    with target:
-        return topi.generic.schedule_argwhere(outs)
-
-
 # scatter
 @override_native_generic_func("scatter_strategy")
 def scatter_strategy(attrs, outs, out_type, target):
@@ -1223,3 +1215,26 @@ def correlation_strategy(attrs, inputs, out_type, target):
         name="correlation.generic",
     )
     return strategy
+
+# argwhere
+def wrap_compute_argwhere(topi_compute):
+    """wrap argwhere topi compute"""
+    def _compute_argwhere(attrs, inputs, out_type):
+        output_shape = []
+        for s in out_type.shape:
+            if hasattr(s, "value"):
+                output_shape.append(s)
+            else:
+                output_shape.append(te.var("any_dim", "int32"))
+        new_output_type = ir.TensorType(output_shape, "int32")
+        return [topi_compute(new_output_type, inputs[0])]
+    return _compute_argwhere
+
+@override_native_generic_func("argwhere_strategy")
+def argwhere_strategy(attrs, inputs, out_type, target):
+    """argwhere generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(wrap_compute_argwhere(topi.argwhere),
+                                wrap_topi_schedule(topi.generic.schedule_argwhere),
+                                name="argwhere.generic")
+    return strategy
diff --git a/python/tvm/topi/argwhere.py b/python/tvm/topi/argwhere.py
index 75c19af35e5c..c2b658a4e92f 100644
--- a/python/tvm/topi/argwhere.py
+++ b/python/tvm/topi/argwhere.py
@@ -16,6 +16,7 @@
 # under the License.
 # pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
 """Argwhere operator"""
+import tvm
 from tvm.te import hybrid
 
 
@@ -169,6 +170,7 @@ def hybrid_argwhere_5d(output_shape, condition):
     return a
 
 
+@tvm.target.generic_func
 def argwhere(output_shape, condition):
     """Find the indices of elements of a tensor that are non-zero.
 
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 6505d8359cb7..b8e346794ad7 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -14,12 +14,511 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=too-many-arguments
+# pylint: disable=too-many-arguments, invalid-name
 """Argwhere operator"""
 
 import tvm
 from tvm import te
-from ..util import traverse_inline
+from .nms import atomic_add
+
+
+def argwhere_1d_ir(condition, out):
+    """Low level IR for argwhere 1D
+
+    Parameters
+    ----------
+    condition : Buffer
+        The condition buffer.
+
+    out : Buffer
+        The output buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.tir.ir_builder.create()
+    a0 = condition.shape[0]
+
+    condition = ib.buffer_ptr(condition)
+    out = ib.buffer_ptr(out)
+
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
+    one_count = tvm.tir.const(1, dtype="int32")
+
+    max_threads = 1024
+    nthread_tx = max_threads
+    nthread_bx = a0 // max_threads + 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    valid_index[0] = 0
+
+    with ib.if_scope(tid < a0):
+        with ib.if_scope(condition[tid] != 0):
+            tmp[0] = atomic_add(
+                tvm.tir.call_intrin(
+                    "handle", "tir.address_of", valid_index[0]
+                ),
+                one_count,
+            )
+            out[tmp[0]] = tid
+
+    return ib.get()
+
+
+def argwhere_1d(output_shape, condition):
+    """Compute for argwhere 1D
+
+    Parameters
+    ----------
+    condition : list of int or tvm.tir.Any
+        The output shape
+
+    out : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    condition_buf = tvm.tir.decl_buffer(
+        condition.shape, condition.dtype, "data_buf", data_alignment=8
+    )
+    out_buf = tvm.tir.decl_buffer(
+        output_shape, "int32", "out_buf", data_alignment=8
+    )
+
+    out = te.extern(
+        [output_shape],
+        [condition],
+        lambda ins, outs: argwhere_1d_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[condition_buf],
+        out_buffers=[out_buf],
+        name="argwhere_1d",
+        tag="argwhere1d_gpu",
+    )
+
+    return out
+
+
+def argwhere_2d_ir(condition, out):
+    """Low level IR for argwhere 2D
+
+    Parameters
+    ----------
+    condition : Buffer
+        The condition buffer.
+
+    out : Buffer
+        The output buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.tir.ir_builder.create()
+    a0 = condition.shape[0]
+    a1 = condition.shape[1]
+
+    condition = ib.buffer_ptr(condition)
+    out = ib.buffer_ptr(out)
+
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
+    one_count = tvm.tir.const(1, dtype="int32")
+
+    max_threads = 1024
+    nthread_tx = max_threads
+    nthread_bx = (a0 * a1) // max_threads + 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    valid_index[0] = 0
+
+    with ib.if_scope(tid < (a0 * a1)):
+        with ib.if_scope(condition[tid] != 0):
+            tmp[0] = atomic_add(
+                tvm.tir.call_intrin(
+                    "handle", "tir.address_of", valid_index[0]
+                ),
+                one_count,
+            )
+            out[tmp[0] * 2] = tvm.tir.floordiv(tid, a1)
+            out[tmp[0] * 2 + 1] = tvm.tir.floormod(tid, a1)
+
+    return ib.get()
+
+
+def argwhere_2d(output_shape, condition):
+    """Compute for argwhere 2D
+
+    Parameters
+    ----------
+    condition : list of int or tvm.tir.Any
+        The output shape
+
+    out : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    condition_buf = tvm.tir.decl_buffer(
+        condition.shape, condition.dtype, "data_buf", data_alignment=8
+    )
+    out_buf = tvm.tir.decl_buffer(
+        output_shape, "int32", "out_buf", data_alignment=8
+    )
+
+    out = te.extern(
+        [output_shape],
+        [condition],
+        lambda ins, outs: argwhere_2d_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[condition_buf],
+        out_buffers=[out_buf],
+        name="argwhere_2d",
+        tag="argwhere2d_gpu",
+    )
+
+    return out
+
+
+def argwhere_3d_ir(condition, out):
+    """Low level IR for argwhere 3D
+
+    Parameters
+    ----------
+    condition : Buffer
+        The condition buffer.
+
+    out : Buffer
+        The output buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.tir.ir_builder.create()
+    a0 = condition.shape[0]
+    a1 = condition.shape[1]
+    a2 = condition.shape[2]
+    s1 = a1 * a2
+    s0 = a0 * s1
+
+    condition = ib.buffer_ptr(condition)
+    out = ib.buffer_ptr(out)
+
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
+    one_count = tvm.tir.const(1, dtype="int32")
+
+    max_threads = 1024
+    nthread_tx = max_threads
+    nthread_bx = s0 // max_threads + 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    fdiv = tvm.tir.floordiv
+    fmod = tvm.tir.floormod
+
+    valid_index[0] = 0
+
+    with ib.if_scope(tid < s0):
+        with ib.if_scope(condition[tid] != 0):
+            tmp[0] = atomic_add(
+                tvm.tir.call_intrin(
+                    "handle", "tir.address_of", valid_index[0]
+                ),
+                one_count,
+            )
+            out[tmp[0] * 3] = fdiv(tid, s1)
+            out[tmp[0] * 3 + 1] = fdiv(fmod(tid, s1), a2)
+            out[tmp[0] * 3 + 2] = fmod(tid, a2)
+
+    return ib.get()
+
+
+def argwhere_3d(output_shape, condition):
+    """Compute for argwhere 3D
+
+    Parameters
+    ----------
+    condition : list of int or tvm.tir.Any
+        The output shape
+
+    out : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    condition_buf = tvm.tir.decl_buffer(
+        condition.shape, condition.dtype, "data_buf", data_alignment=8
+    )
+    out_buf = tvm.tir.decl_buffer(
+        output_shape, "int32", "out_buf", data_alignment=8
+    )
+
+    out = te.extern(
+        [output_shape],
+        [condition],
+        lambda ins, outs: argwhere_3d_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[condition_buf],
+        out_buffers=[out_buf],
+        name="argwhere_3d",
+        tag="argwhere3d_gpu",
+    )
+
+    return out
+
+
+def argwhere_4d_ir(condition, out):
+    """Low level IR for argwhere 4D
+
+    Parameters
+    ----------
+    condition : Buffer
+        The condition buffer.
+
+    out : Buffer
+        The output buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.tir.ir_builder.create()
+    a0 = condition.shape[0]
+    a1 = condition.shape[1]
+    a2 = condition.shape[2]
+    a3 = condition.shape[3]
+    s1 = a2 * a3
+    s2 = a1 * s1
+    s0 = a0 * s2
+
+    condition = ib.buffer_ptr(condition)
+    out = ib.buffer_ptr(out)
+
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
+    one_count = tvm.tir.const(1, dtype="int32")
+
+    max_threads = 1024
+    nthread_tx = max_threads
+    nthread_bx = s0 // max_threads + 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    fdiv = tvm.tir.floordiv
+    fmod = tvm.tir.floormod
+
+    valid_index[0] = 0
+
+    with ib.if_scope(tid < s0):
+        with ib.if_scope(condition[tid] != 0):
+            tmp[0] = atomic_add(
+                tvm.tir.call_intrin(
+                    "handle", "tir.address_of", valid_index[0]
+                ),
+                one_count,
+            )
+            out[tmp[0] * 4] = fdiv(tid, s2)
+            out[tmp[0] * 4 + 1] = fdiv(fmod(tid, s2), s1)
+            out[tmp[0] * 4 + 2] = fdiv(fmod(tid, s1), a3)
+            out[tmp[0] * 4 + 3] = fmod(tid, a3)
+
+    return ib.get()
+
+
+def argwhere_4d(output_shape, condition):
+    """Compute for argwhere 4D
+
+    Parameters
+    ----------
+    condition : list of int or tvm.tir.Any
+        The output shape
+
+    out : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    condition_buf = tvm.tir.decl_buffer(
+        condition.shape, condition.dtype, "data_buf", data_alignment=8
+    )
+    out_buf = tvm.tir.decl_buffer(
+        output_shape, "int32", "out_buf", data_alignment=8
+    )
+
+    out = te.extern(
+        [output_shape],
+        [condition],
+        lambda ins, outs: argwhere_4d_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[condition_buf],
+        out_buffers=[out_buf],
+        name="argwhere_4d",
+        tag="argwhere4d_gpu",
+    )
+
+    return out
+
+
+def argwhere_5d_ir(condition, out):
+    """Low level IR for argwhere 5D
+
+    Parameters
+    ----------
+    condition : Buffer
+        The condition buffer.
+
+    out : Buffer
+        The output buffer.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.tir.ir_builder.create()
+    a0 = condition.shape[0]
+    a1 = condition.shape[1]
+    a2 = condition.shape[2]
+    a3 = condition.shape[3]
+    a4 = condition.shape[4]
+    s1 = a3 * a4
+    s2 = a2 * s1
+    s3 = a1 * s2
+    s0 = a0 * s3
+
+    condition = ib.buffer_ptr(condition)
+    out = ib.buffer_ptr(out)
+
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
+    one_count = tvm.tir.const(1, dtype="int32")
+
+    max_threads = 1024
+    nthread_tx = max_threads
+    nthread_bx = s0 // max_threads + 1
+    tx = te.thread_axis("threadIdx.x")
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    fdiv = tvm.tir.floordiv
+    fmod = tvm.tir.floormod
+
+    valid_index[0] = 0
+
+    with ib.if_scope(tid < s0):
+        with ib.if_scope(condition[tid] != 0):
+            tmp[0] = atomic_add(
+                tvm.tir.call_intrin(
+                    "handle", "tir.address_of", valid_index[0]
+                ),
+                one_count,
+            )
+            out[tmp[0] * 5] = fdiv(tid, s3)
+            out[tmp[0] * 5 + 1] = fdiv(fmod(tid, s3), s2)
+            out[tmp[0] * 5 + 2] = fdiv(fmod(tid, s2), s1)
+            out[tmp[0] * 5 + 3] = fdiv(fmod(tid, s1), a4)
+            out[tmp[0] * 5 + 4] = fmod(tid, a4)
+
+    return ib.get()
+
+
+def argwhere_5d(output_shape, condition):
+    """Compute for argwhere 5D
+
+    Parameters
+    ----------
+    condition : list of int or tvm.tir.Any
+        The output shape
+
+    out : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    condition_buf = tvm.tir.decl_buffer(
+        condition.shape, condition.dtype, "data_buf", data_alignment=8
+    )
+    out_buf = tvm.tir.decl_buffer(
+        output_shape, "int32", "out_buf", data_alignment=8
+    )
+
+    out = te.extern(
+        [output_shape],
+        [condition],
+        lambda ins, outs: argwhere_5d_ir(ins[0], outs[0]),
+        dtype=["int32"],
+        in_buffers=[condition_buf],
+        out_buffers=[out_buf],
+        name="argwhere_5d",
+        tag="argwhere5d_gpu",
+    )
+
+    return out
+
+
+def argwhere(output_shape, condition):
+    """Find the indices of elements of a tensor that are non-zero.
+
+    Parameters
+    ----------
+    output_shape : tvm.te.Tensor
+        Tensor with output shape info.
+
+    condition : tvm.te.Tensor
+        Tensor with boolean values.
+
+    Returns
+    -------
+    out : tvm.te.Tensor
+        Indices of non-zero elements.
+    """
+    if len(condition.shape) == 1:
+        return argwhere_1d(output_shape.shape, condition)
+    if len(condition.shape) == 2:
+        return argwhere_2d(output_shape.shape, condition)
+    if len(condition.shape) == 3:
+        return argwhere_3d(output_shape.shape, condition)
+    if len(condition.shape) == 4:
+        return argwhere_4d(output_shape.shape, condition)
+    if len(condition.shape) == 5:
+        return argwhere_5d(output_shape.shape, condition)
+    raise ValueError("Argwhere does not support rank higher than 5")
+
 
 def schedule_argwhere(outs):
     """Schedule for argwhere on cuda.
@@ -38,17 +537,4 @@ def schedule_argwhere(outs):
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     sch = te.create_schedule([x.op for x in outs])
 
-    def _schedule_argwhere(op):
-        if op in sch.outputs:
-            out = op
-        else:
-            out = outs[0].op.output(0)
-        fused = sch[out].fuse(*sch[out].op.axis)
-        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
-        bx, tx = sch[out].split(fused, factor=num_thread)
-        sch[out].bind(bx, te.thread_axis("blockIdx.x"))
-        sch[out].bind(tx, te.thread_axis("threadIdx.x"))
-
-    traverse_inline(sch, outs[0].op, _schedule_argwhere)
-
     return sch
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index d8e189588421..eb99fffe6f12 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -27,24 +27,28 @@
     "gpu": topi.cuda.schedule_argwhere,
 }
 
+_argwhere_compute = {
+    "llvm": topi.argwhere,
+    "cuda": topi.cuda.argwhere
+}
+
 def verify_argwhere(data_shape):
     dtype = "int32"
-    np_data = np.random.randint(5, size=data_shape).astype(dtype)
-    np_out = np.argwhere(np_data > 0)
+    np_data = np.random.choice([0, 1, 2, 3], size=data_shape).astype(dtype)
+    np_out = np.argwhere(np_data)
     out_shape = np_out.shape[0]
     np_shape = np.ones(shape=(out_shape, len(data_shape)), dtype=dtype)
 
     out_shape = te.placeholder(shape=(out_shape, len(data_shape)),
                                name="out_shape", dtype=dtype)
     condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype)
-    out = topi.argwhere(out_shape, condition)
 
     def check_device(device, ctx):
         ctx = tvm.context(device, 0)
-        if not ctx.exist:
-            print("Skip because %s is not enabled" % device)
+        if not ctx.exist or device not in _argwhere_compute:
             return
 
+        out = _argwhere_compute[device](out_shape, condition)
         with tvm.target.create(device):
             s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule)
             sch = s_func(out)
@@ -65,9 +69,11 @@ def check_device(device, ctx):
 @tvm.testing.uses_gpu
 def test_argwhere():
     verify_argwhere((1,))
-    verify_argwhere((5,))
+    verify_argwhere((100,))
     verify_argwhere((5, 3))
+    verify_argwhere((100, 100))
     verify_argwhere((6, 5, 3))
+    verify_argwhere((32, 32, 16))
     verify_argwhere((6, 4, 5, 3))
     verify_argwhere((6, 4, 5, 3, 7))
 

From b74d59faa1095fe8e70cda43e7caaa6e585c8978 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Sat, 12 Sep 2020 05:39:47 +0000
Subject: [PATCH 03/13] sort argwhere result

---
 python/tvm/topi/cuda/argwhere.py              | 96 +++++++++++++++++--
 .../python/topi/python/test_topi_argwhere.py  | 35 ++++---
 2 files changed, 109 insertions(+), 22 deletions(-)

diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index b8e346794ad7..9949ccfe0c48 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -19,7 +19,11 @@
 
 import tvm
 from tvm import te
+from .injective import schedule_injective_from_existing
 from .nms import atomic_add
+from .sort import topk, argsort
+from .. import tag
+from ..transform import strided_slice, adv_index, squeeze
 
 
 def argwhere_1d_ir(condition, out):
@@ -48,7 +52,9 @@ def argwhere_1d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = 1024
+    max_threads = int(
+        tvm.target.Target.current(allow_none=False).max_num_threads
+    )
     nthread_tx = max_threads
     nthread_bx = a0 // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
@@ -105,7 +111,11 @@ def argwhere_1d(output_shape, condition):
         tag="argwhere1d_gpu",
     )
 
-    return out
+    sorted_out = topk(
+        out, k=0, axis=0, ret_type="values", is_ascend="True", dtype="int32"
+    )
+
+    return sorted_out
 
 
 def argwhere_2d_ir(condition, out):
@@ -135,7 +145,9 @@ def argwhere_2d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = 1024
+    max_threads = int(
+        tvm.target.Target.current(allow_none=False).max_num_threads
+    )
     nthread_tx = max_threads
     nthread_bx = (a0 * a1) // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
@@ -194,7 +206,21 @@ def argwhere_2d(output_shape, condition):
         tag="argwhere2d_gpu",
     )
 
-    return out
+    if out.shape[0] <= 1:
+        return out
+
+    # sort the output from the least significant to the most significant
+    # column.
+    out1 = strided_slice(out, [0, 1], [out.shape[0], 2])
+    out2 = argsort(out1, axis=0, dtype="int32")
+    out3 = squeeze(out2)
+    out = adv_index(out, [out3])
+
+    out1 = strided_slice(out, [0, 0], [out.shape[0], 1])
+    out2 = argsort(out1, axis=0, dtype="int32")
+    out3 = squeeze(out2)
+
+    return adv_index(out, [out3])
 
 
 def argwhere_3d_ir(condition, out):
@@ -227,7 +253,9 @@ def argwhere_3d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = 1024
+    max_threads = int(
+        tvm.target.Target.current(allow_none=False).max_num_threads
+    )
     nthread_tx = max_threads
     nthread_bx = s0 // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
@@ -289,6 +317,17 @@ def argwhere_3d(output_shape, condition):
         tag="argwhere3d_gpu",
     )
 
+    if out.shape[0] <= 1:
+        return out
+
+    # sort the output from the least significant to the most significant
+    # column.
+    for i in reversed(range(3)):
+        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+        out2 = argsort(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
+
     return out
 
 
@@ -324,7 +363,9 @@ def argwhere_4d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = 1024
+    max_threads = int(
+        tvm.target.Target.current(allow_none=False).max_num_threads
+    )
     nthread_tx = max_threads
     nthread_bx = s0 // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
@@ -387,6 +428,17 @@ def argwhere_4d(output_shape, condition):
         tag="argwhere4d_gpu",
     )
 
+    if out.shape[0] <= 1:
+        return out
+
+    # sort the output from the least significant to the most significant
+    # column.
+    for i in reversed(range(4)):
+        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+        out2 = argsort(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
+
     return out
 
 
@@ -424,7 +476,9 @@ def argwhere_5d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = 1024
+    max_threads = int(
+        tvm.target.Target.current(allow_none=False).max_num_threads
+    )
     nthread_tx = max_threads
     nthread_bx = s0 // max_threads + 1
     tx = te.thread_axis("threadIdx.x")
@@ -488,6 +542,17 @@ def argwhere_5d(output_shape, condition):
         tag="argwhere5d_gpu",
     )
 
+    if out.shape[0] <= 1:
+        return out
+
+    # sort the output from the least significant to the most significant
+    # column.
+    for i in reversed(range(5)):
+        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+        out2 = argsort(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
+
     return out
 
 
@@ -535,6 +600,17 @@ def schedule_argwhere(outs):
         The computation schedule for argwhere
     """
     outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    sch = te.create_schedule([x.op for x in outs])
-
-    return sch
+    s = te.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+
+    def traverse(op):
+        if tag.is_injective(op.tag):
+            schedule_injective_from_existing(s, op.output(0))
+        for tensor in op.input_tensors:
+            if tensor.op.input_tensors and tensor.op not in scheduled_ops:
+                traverse(tensor.op)
+        scheduled_ops.append(op)
+
+    for out in outs:
+        traverse(out.op)
+    return s
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index eb99fffe6f12..817585fef2c4 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -27,10 +27,8 @@
     "gpu": topi.cuda.schedule_argwhere,
 }
 
-_argwhere_compute = {
-    "llvm": topi.argwhere,
-    "cuda": topi.cuda.argwhere
-}
+_argwhere_compute = {"llvm": topi.argwhere, "cuda": topi.cuda.argwhere}
+
 
 def verify_argwhere(data_shape):
     dtype = "int32"
@@ -39,8 +37,9 @@ def verify_argwhere(data_shape):
     out_shape = np_out.shape[0]
     np_shape = np.ones(shape=(out_shape, len(data_shape)), dtype=dtype)
 
-    out_shape = te.placeholder(shape=(out_shape, len(data_shape)),
-                               name="out_shape", dtype=dtype)
+    out_shape = te.placeholder(
+        shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype
+    )
     condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype)
 
     def check_device(device, ctx):
@@ -48,18 +47,23 @@ def check_device(device, ctx):
         if not ctx.exist or device not in _argwhere_compute:
             return
 
-        out = _argwhere_compute[device](out_shape, condition)
-        with tvm.target.create(device):
+        with tvm.target.Target(device):
+            out = _argwhere_compute[device](out_shape, condition)
             s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule)
             sch = s_func(out)
 
-        func = tvm.build(sch, [out_shape, condition, out], device,
-                         name="argwhere")
+        func = tvm.build(
+            sch, [out_shape, condition, out], device, name="argwhere"
+        )
+
+        # print(func.imported_modules[0].get_source())
 
         args = [tvm.nd.array(np_shape, ctx)]
         args.append(tvm.nd.array(np_data, ctx))
         args.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=condition.dtype))
         func(*args)
+        np.set_printoptions(threshold=np.inf)
+        # print(args[-1].asnumpy())
         tvm.testing.assert_allclose(args[-1].asnumpy(), np.array(np_out))
 
     for target, ctx in tvm.testing.enabled_targets():
@@ -70,11 +74,18 @@ def check_device(device, ctx):
 def test_argwhere():
     verify_argwhere((1,))
     verify_argwhere((100,))
+    verify_argwhere((1, 1))
     verify_argwhere((5, 3))
-    verify_argwhere((100, 100))
+    verify_argwhere((32, 64))
+    # TODO(zhiics) This test is flaky because nothing is sorted.
+    verify_argwhere((128, 65))
     verify_argwhere((6, 5, 3))
-    verify_argwhere((32, 32, 16))
+    verify_argwhere((1, 1, 1))
+    # TODO(zhiics) This test is flaky.
+    # verify_argwhere((32, 32, 8))
+    verify_argwhere((1, 1, 1, 1))
     verify_argwhere((6, 4, 5, 3))
+    verify_argwhere((1, 1, 1, 1, 1))
     verify_argwhere((6, 4, 5, 3, 7))
 
 

From 6d83d85d91c93f666165f339257f2c3e2d12823b Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Thu, 5 Nov 2020 22:23:37 +0000
Subject: [PATCH 04/13] Use single block and thrust to fix flaky behavior

---
 python/tvm/topi/cuda/argwhere.py              | 203 +++++++++++-------
 python/tvm/topi/cuda/sort.py                  |   2 +
 .../python/topi/python/test_topi_argwhere.py  |   4 +-
 3 files changed, 123 insertions(+), 86 deletions(-)

diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 9949ccfe0c48..4791eb488c35 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -17,14 +17,34 @@
 # pylint: disable=too-many-arguments, invalid-name
 """Argwhere operator"""
 
+import logging
+
 import tvm
 from tvm import te
+from tvm._ffi import get_global_func
 from .injective import schedule_injective_from_existing
 from .nms import atomic_add
-from .sort import topk, argsort
+from .sort import topk, topk_thrust, argsort, argsort_thrust
 from .. import tag
 from ..transform import strided_slice, adv_index, squeeze
 
+logger = logging.getLogger("topi")
+
+
+def _get_sort_func(mode=0):
+    """Get sort function for argwhere. mode 0 for topk and others for argsort."""
+    if get_global_func(
+        "tvm.contrib.thrust.sort", allow_missing=True
+    ):
+        ret = topk_thrust if mode == 0 else argsort_thrust
+    else:
+        logger.warn("It's highly recommended to enable thrust library with set(USE_THRUST ON)"
+                    " when compiling argwhere for cuda target. Otherwise, it can result in"
+                    " significant performance degradation or incorrect result")    
+        ret = topk if mode == 0 else argsort
+
+    return ret
+
 
 def argwhere_1d_ir(condition, out):
     """Low level IR for argwhere 1D
@@ -48,7 +68,7 @@ def argwhere_1d_ir(condition, out):
     condition = ib.buffer_ptr(condition)
     out = ib.buffer_ptr(out)
 
-    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local")
+    valid_index = ib.allocate("int32", (1,), name="valid_index", scope="global")
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
@@ -56,23 +76,23 @@ def argwhere_1d_ir(condition, out):
         tvm.target.Target.current(allow_none=False).max_num_threads
     )
     nthread_tx = max_threads
-    nthread_bx = a0 // max_threads + 1
+    # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
+    len_inner_for = a0 // nthread_tx + 1
     valid_index[0] = 0
 
-    with ib.if_scope(tid < a0):
-        with ib.if_scope(condition[tid] != 0):
-            tmp[0] = atomic_add(
-                tvm.tir.call_intrin(
-                    "handle", "tir.address_of", valid_index[0]
-                ),
-                one_count,
-            )
-            out[tmp[0]] = tid
+    with ib.for_range(0, len_inner_for, name="i") as i:
+        idx = tx * len_inner_for + i
+        with ib.if_scope(idx < a0):
+            with ib.if_scope(condition[idx] != 0):
+                tmp[0] = atomic_add(
+                    tvm.tir.call_intrin(
+                        "handle", "tir.address_of", valid_index[0]
+                    ),
+                    one_count,
+                )
+                out[tmp[0]] = idx
 
     return ib.get()
 
@@ -111,7 +131,10 @@ def argwhere_1d(output_shape, condition):
         tag="argwhere1d_gpu",
     )
 
-    sorted_out = topk(
+    if out.shape[0] <= 1:
+        return out
+
+    sorted_out = _get_sort_func()(
         out, k=0, axis=0, ret_type="values", is_ascend="True", dtype="int32"
     )
 
@@ -138,6 +161,8 @@ def argwhere_2d_ir(condition, out):
     a0 = condition.shape[0]
     a1 = condition.shape[1]
 
+    out_len = out.shape[0] * out.shape[1]
+
     condition = ib.buffer_ptr(condition)
     out = ib.buffer_ptr(out)
 
@@ -149,25 +174,26 @@ def argwhere_2d_ir(condition, out):
         tvm.target.Target.current(allow_none=False).max_num_threads
     )
     nthread_tx = max_threads
-    nthread_bx = (a0 * a1) // max_threads + 1
+
+    # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
+    len_inner_for = (a0 * a1) // nthread_tx + 1
 
     valid_index[0] = 0
 
-    with ib.if_scope(tid < (a0 * a1)):
-        with ib.if_scope(condition[tid] != 0):
-            tmp[0] = atomic_add(
-                tvm.tir.call_intrin(
-                    "handle", "tir.address_of", valid_index[0]
-                ),
-                one_count,
-            )
-            out[tmp[0] * 2] = tvm.tir.floordiv(tid, a1)
-            out[tmp[0] * 2 + 1] = tvm.tir.floormod(tid, a1)
+    with ib.for_range(0, len_inner_for, name="i") as i:
+        idx = tx * len_inner_for + i
+        with ib.if_scope(idx < (a0 * a1)):
+            with ib.if_scope(condition[idx] != 0):
+                tmp[0] = atomic_add(
+                    tvm.tir.call_intrin(
+                        "handle", "tir.address_of", valid_index[0]
+                    ),
+                    one_count,
+                )
+                out[tmp[0] * 2] = tvm.tir.floordiv(idx, a1)
+                out[tmp[0] * 2 + 1] = tvm.tir.floormod(idx, a1)
 
     return ib.get()
 
@@ -209,15 +235,17 @@ def argwhere_2d(output_shape, condition):
     if out.shape[0] <= 1:
         return out
 
+    sort_func = _get_sort_func(1)
+
     # sort the output from the least significant to the most significant
     # column.
     out1 = strided_slice(out, [0, 1], [out.shape[0], 2])
-    out2 = argsort(out1, axis=0, dtype="int32")
+    out2 = sort_func(out1, axis=0, dtype="int32")
     out3 = squeeze(out2)
     out = adv_index(out, [out3])
 
     out1 = strided_slice(out, [0, 0], [out.shape[0], 1])
-    out2 = argsort(out1, axis=0, dtype="int32")
+    out2 = sort_func(out1, axis=0, dtype="int32")
     out3 = squeeze(out2)
 
     return adv_index(out, [out3])
@@ -257,28 +285,30 @@ def argwhere_3d_ir(condition, out):
         tvm.target.Target.current(allow_none=False).max_num_threads
     )
     nthread_tx = max_threads
-    nthread_bx = s0 // max_threads + 1
+
+    # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
+    len_inner_for = s0 // nthread_tx + 1
+
     fdiv = tvm.tir.floordiv
     fmod = tvm.tir.floormod
 
     valid_index[0] = 0
 
-    with ib.if_scope(tid < s0):
-        with ib.if_scope(condition[tid] != 0):
-            tmp[0] = atomic_add(
-                tvm.tir.call_intrin(
-                    "handle", "tir.address_of", valid_index[0]
-                ),
-                one_count,
-            )
-            out[tmp[0] * 3] = fdiv(tid, s1)
-            out[tmp[0] * 3 + 1] = fdiv(fmod(tid, s1), a2)
-            out[tmp[0] * 3 + 2] = fmod(tid, a2)
+    with ib.for_range(0, len_inner_for, name="i") as i:
+        idx = tx * len_inner_for + i
+        with ib.if_scope(idx < s0):
+            with ib.if_scope(condition[idx] != 0):
+                tmp[0] = atomic_add(
+                    tvm.tir.call_intrin(
+                        "handle", "tir.address_of", valid_index[0]
+                    ),
+                    one_count,
+                )
+                out[tmp[0] * 3] = fdiv(idx, s1)
+                out[tmp[0] * 3 + 1] = fdiv(fmod(idx, s1), a2)
+                out[tmp[0] * 3 + 2] = fmod(idx, a2)
 
     return ib.get()
 
@@ -322,9 +352,10 @@ def argwhere_3d(output_shape, condition):
 
     # sort the output from the least significant to the most significant
     # column.
+    sort_func = _get_sort_func(1)
     for i in reversed(range(3)):
         out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = argsort(out1, axis=0, dtype="int32")
+        out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
         out = adv_index(out, [out3])
 
@@ -367,29 +398,31 @@ def argwhere_4d_ir(condition, out):
         tvm.target.Target.current(allow_none=False).max_num_threads
     )
     nthread_tx = max_threads
-    nthread_bx = s0 // max_threads + 1
+    
+    # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
+    len_inner_for = s0 // nthread_tx + 1    
+
     fdiv = tvm.tir.floordiv
     fmod = tvm.tir.floormod
 
     valid_index[0] = 0
 
-    with ib.if_scope(tid < s0):
-        with ib.if_scope(condition[tid] != 0):
-            tmp[0] = atomic_add(
-                tvm.tir.call_intrin(
-                    "handle", "tir.address_of", valid_index[0]
-                ),
-                one_count,
-            )
-            out[tmp[0] * 4] = fdiv(tid, s2)
-            out[tmp[0] * 4 + 1] = fdiv(fmod(tid, s2), s1)
-            out[tmp[0] * 4 + 2] = fdiv(fmod(tid, s1), a3)
-            out[tmp[0] * 4 + 3] = fmod(tid, a3)
+    with ib.for_range(0, len_inner_for, name="i") as i:
+        idx = tx * len_inner_for + i
+        with ib.if_scope(idx < s0):
+            with ib.if_scope(condition[idx] != 0):
+                tmp[0] = atomic_add(
+                    tvm.tir.call_intrin(
+                        "handle", "tir.address_of", valid_index[0]
+                    ),
+                    one_count,
+                )
+                out[tmp[0] * 4] = fdiv(idx, s2)
+                out[tmp[0] * 4 + 1] = fdiv(fmod(idx, s2), s1)
+                out[tmp[0] * 4 + 2] = fdiv(fmod(idx, s1), a3)
+                out[tmp[0] * 4 + 3] = fmod(idx, a3)
 
     return ib.get()
 
@@ -433,9 +466,10 @@ def argwhere_4d(output_shape, condition):
 
     # sort the output from the least significant to the most significant
     # column.
+    sort_func = _get_sort_func(1)
     for i in reversed(range(4)):
         out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = argsort(out1, axis=0, dtype="int32")
+        out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
         out = adv_index(out, [out3])
 
@@ -480,30 +514,32 @@ def argwhere_5d_ir(condition, out):
         tvm.target.Target.current(allow_none=False).max_num_threads
     )
     nthread_tx = max_threads
-    nthread_bx = s0 // max_threads + 1
+
+    # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
-    bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
+    len_inner_for = s0 // nthread_tx + 1
+
     fdiv = tvm.tir.floordiv
     fmod = tvm.tir.floormod
 
     valid_index[0] = 0
 
-    with ib.if_scope(tid < s0):
-        with ib.if_scope(condition[tid] != 0):
-            tmp[0] = atomic_add(
-                tvm.tir.call_intrin(
-                    "handle", "tir.address_of", valid_index[0]
-                ),
-                one_count,
-            )
-            out[tmp[0] * 5] = fdiv(tid, s3)
-            out[tmp[0] * 5 + 1] = fdiv(fmod(tid, s3), s2)
-            out[tmp[0] * 5 + 2] = fdiv(fmod(tid, s2), s1)
-            out[tmp[0] * 5 + 3] = fdiv(fmod(tid, s1), a4)
-            out[tmp[0] * 5 + 4] = fmod(tid, a4)
+    with ib.for_range(0, len_inner_for, name="i") as i:
+        idx = tx * len_inner_for + i
+        with ib.if_scope(idx < s0):
+            with ib.if_scope(condition[idx] != 0):
+                tmp[0] = atomic_add(
+                    tvm.tir.call_intrin(
+                        "handle", "tir.address_of", valid_index[0]
+                    ),
+                    one_count,
+                )
+                out[tmp[0] * 5] = fdiv(idx, s3)
+                out[tmp[0] * 5 + 1] = fdiv(fmod(idx, s3), s2)
+                out[tmp[0] * 5 + 2] = fdiv(fmod(idx, s2), s1)
+                out[tmp[0] * 5 + 3] = fdiv(fmod(idx, s1), a4)
+                out[tmp[0] * 5 + 4] = fmod(idx, a4)
 
     return ib.get()
 
@@ -547,9 +583,10 @@ def argwhere_5d(output_shape, condition):
 
     # sort the output from the least significant to the most significant
     # column.
+    sort_func = _get_sort_func(1)
     for i in reversed(range(5)):
         out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = argsort(out1, axis=0, dtype="int32")
+        out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
         out = adv_index(out, [out3])
 
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index ac14f5aae779..2a7f4eb92daa 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -550,6 +550,8 @@ def topk_thrust(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int
         tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8),
     ]
 
+    is_ascend = 1 if is_ascend else 0
+
     out = te.extern(
         [data.shape, data.shape],
         [data],
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index 817585fef2c4..5181e45b8b5d 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -77,12 +77,10 @@ def test_argwhere():
     verify_argwhere((1, 1))
     verify_argwhere((5, 3))
     verify_argwhere((32, 64))
-    # TODO(zhiics) This test is flaky because nothing is sorted.
     verify_argwhere((128, 65))
+    verify_argwhere((200, 500))
     verify_argwhere((6, 5, 3))
     verify_argwhere((1, 1, 1))
-    # TODO(zhiics) This test is flaky.
-    # verify_argwhere((32, 32, 8))
     verify_argwhere((1, 1, 1, 1))
     verify_argwhere((6, 4, 5, 3))
     verify_argwhere((1, 1, 1, 1, 1))

From 17168463e32a31e9518b28f056c39ce7895f30c6 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 6 Nov 2020 19:19:04 +0000
Subject: [PATCH 05/13] format

---
 python/tvm/relay/op/strategy/cuda.py          |  4 +-
 python/tvm/relay/op/strategy/generic.py       | 12 ++-
 python/tvm/topi/cuda/argwhere.py              | 78 ++++++-------------
 .../python/topi/python/test_topi_argwhere.py  |  8 +-
 4 files changed, 37 insertions(+), 65 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 3a935d3288cf..fc80c9ed6171 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -922,6 +922,7 @@ def correlation_strategy_cuda(attrs, inputs, out_type, target):
     )
     return strategy
 
+
 @argwhere_strategy.register(["cuda", "gpu"])
 def argwhere_strategy_cuda(attrs, inputs, out_type, target):
     """argwhere cuda strategy"""
@@ -929,5 +930,6 @@ def argwhere_strategy_cuda(attrs, inputs, out_type, target):
     strategy.add_implementation(
         wrap_compute_argwhere(topi.cuda.argwhere),
         wrap_topi_schedule(topi.cuda.schedule_argwhere),
-        name="argwhere.cuda")
+        name="argwhere.cuda",
+    )
     return strategy
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index f90ed6e1a220..15c7f2f7fa17 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -1216,9 +1216,11 @@ def correlation_strategy(attrs, inputs, out_type, target):
     )
     return strategy
 
+
 # argwhere
 def wrap_compute_argwhere(topi_compute):
     """wrap argwhere topi compute"""
+
     def _compute_argwhere(attrs, inputs, out_type):
         output_shape = []
         for s in out_type.shape:
@@ -1228,13 +1230,17 @@ def _compute_argwhere(attrs, inputs, out_type):
                 output_shape.append(te.var("any_dim", "int32"))
         new_output_type = ir.TensorType(output_shape, "int32")
         return [topi_compute(new_output_type, inputs[0])]
+
     return _compute_argwhere
 
+
 @override_native_generic_func("argwhere_strategy")
 def argwhere_strategy(attrs, inputs, out_type, target):
     """argwhere generic strategy"""
     strategy = _op.OpStrategy()
-    strategy.add_implementation(wrap_compute_argwhere(topi.argwhere),
-                                wrap_topi_schedule(topi.generic.schedule_argwhere),
-                                name="argwhere.generic")
+    strategy.add_implementation(
+        wrap_compute_argwhere(topi.argwhere),
+        wrap_topi_schedule(topi.generic.schedule_argwhere),
+        name="argwhere.generic",
+    )
     return strategy
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 4791eb488c35..17d2410a517f 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -33,14 +33,14 @@
 
 def _get_sort_func(mode=0):
     """Get sort function for argwhere. mode 0 for topk and others for argsort."""
-    if get_global_func(
-        "tvm.contrib.thrust.sort", allow_missing=True
-    ):
+    if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
         ret = topk_thrust if mode == 0 else argsort_thrust
     else:
-        logger.warn("It's highly recommended to enable thrust library with set(USE_THRUST ON)"
-                    " when compiling argwhere for cuda target. Otherwise, it can result in"
-                    " significant performance degradation or incorrect result")    
+        logger.warn(
+            "It's highly recommended to enable thrust library with set(USE_THRUST ON)"
+            " when compiling argwhere for cuda target. Otherwise, it can result in"
+            " significant performance degradation or incorrect result"
+        )
         ret = topk if mode == 0 else argsort
 
     return ret
@@ -72,9 +72,7 @@ def argwhere_1d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(
-        tvm.target.Target.current(allow_none=False).max_num_threads
-    )
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
     # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
@@ -87,9 +85,7 @@ def argwhere_1d_ir(condition, out):
         with ib.if_scope(idx < a0):
             with ib.if_scope(condition[idx] != 0):
                 tmp[0] = atomic_add(
-                    tvm.tir.call_intrin(
-                        "handle", "tir.address_of", valid_index[0]
-                    ),
+                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
                     one_count,
                 )
                 out[tmp[0]] = idx
@@ -116,9 +112,7 @@ def argwhere_1d(output_shape, condition):
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
-    out_buf = tvm.tir.decl_buffer(
-        output_shape, "int32", "out_buf", data_alignment=8
-    )
+    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
@@ -161,8 +155,6 @@ def argwhere_2d_ir(condition, out):
     a0 = condition.shape[0]
     a1 = condition.shape[1]
 
-    out_len = out.shape[0] * out.shape[1]
-
     condition = ib.buffer_ptr(condition)
     out = ib.buffer_ptr(out)
 
@@ -170,9 +162,7 @@ def argwhere_2d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(
-        tvm.target.Target.current(allow_none=False).max_num_threads
-    )
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
 
     # Limit threads to a single block to make sure atomic_add works normally.
@@ -187,9 +177,7 @@ def argwhere_2d_ir(condition, out):
         with ib.if_scope(idx < (a0 * a1)):
             with ib.if_scope(condition[idx] != 0):
                 tmp[0] = atomic_add(
-                    tvm.tir.call_intrin(
-                        "handle", "tir.address_of", valid_index[0]
-                    ),
+                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
                     one_count,
                 )
                 out[tmp[0] * 2] = tvm.tir.floordiv(idx, a1)
@@ -217,9 +205,7 @@ def argwhere_2d(output_shape, condition):
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
-    out_buf = tvm.tir.decl_buffer(
-        output_shape, "int32", "out_buf", data_alignment=8
-    )
+    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
@@ -281,9 +267,7 @@ def argwhere_3d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(
-        tvm.target.Target.current(allow_none=False).max_num_threads
-    )
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
 
     # Limit threads to a single block to make sure atomic_add works normally.
@@ -301,9 +285,7 @@ def argwhere_3d_ir(condition, out):
         with ib.if_scope(idx < s0):
             with ib.if_scope(condition[idx] != 0):
                 tmp[0] = atomic_add(
-                    tvm.tir.call_intrin(
-                        "handle", "tir.address_of", valid_index[0]
-                    ),
+                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
                     one_count,
                 )
                 out[tmp[0] * 3] = fdiv(idx, s1)
@@ -332,9 +314,7 @@ def argwhere_3d(output_shape, condition):
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
-    out_buf = tvm.tir.decl_buffer(
-        output_shape, "int32", "out_buf", data_alignment=8
-    )
+    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
@@ -394,15 +374,13 @@ def argwhere_4d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(
-        tvm.target.Target.current(allow_none=False).max_num_threads
-    )
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
-    
+
     # Limit threads to a single block to make sure atomic_add works normally.
     tx = te.thread_axis("threadIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
-    len_inner_for = s0 // nthread_tx + 1    
+    len_inner_for = s0 // nthread_tx + 1
 
     fdiv = tvm.tir.floordiv
     fmod = tvm.tir.floormod
@@ -414,9 +392,7 @@ def argwhere_4d_ir(condition, out):
         with ib.if_scope(idx < s0):
             with ib.if_scope(condition[idx] != 0):
                 tmp[0] = atomic_add(
-                    tvm.tir.call_intrin(
-                        "handle", "tir.address_of", valid_index[0]
-                    ),
+                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
                     one_count,
                 )
                 out[tmp[0] * 4] = fdiv(idx, s2)
@@ -446,9 +422,7 @@ def argwhere_4d(output_shape, condition):
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
-    out_buf = tvm.tir.decl_buffer(
-        output_shape, "int32", "out_buf", data_alignment=8
-    )
+    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
@@ -510,9 +484,7 @@ def argwhere_5d_ir(condition, out):
     tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
     one_count = tvm.tir.const(1, dtype="int32")
 
-    max_threads = int(
-        tvm.target.Target.current(allow_none=False).max_num_threads
-    )
+    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
 
     # Limit threads to a single block to make sure atomic_add works normally.
@@ -530,9 +502,7 @@ def argwhere_5d_ir(condition, out):
         with ib.if_scope(idx < s0):
             with ib.if_scope(condition[idx] != 0):
                 tmp[0] = atomic_add(
-                    tvm.tir.call_intrin(
-                        "handle", "tir.address_of", valid_index[0]
-                    ),
+                    tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]),
                     one_count,
                 )
                 out[tmp[0] * 5] = fdiv(idx, s3)
@@ -563,9 +533,7 @@ def argwhere_5d(output_shape, condition):
     condition_buf = tvm.tir.decl_buffer(
         condition.shape, condition.dtype, "data_buf", data_alignment=8
     )
-    out_buf = tvm.tir.decl_buffer(
-        output_shape, "int32", "out_buf", data_alignment=8
-    )
+    out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8)
 
     out = te.extern(
         [output_shape],
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index 5181e45b8b5d..b9555ff48f08 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -37,9 +37,7 @@ def verify_argwhere(data_shape):
     out_shape = np_out.shape[0]
     np_shape = np.ones(shape=(out_shape, len(data_shape)), dtype=dtype)
 
-    out_shape = te.placeholder(
-        shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype
-    )
+    out_shape = te.placeholder(shape=(out_shape, len(data_shape)), name="out_shape", dtype=dtype)
     condition = te.placeholder(shape=data_shape, name="condition", dtype=dtype)
 
     def check_device(device, ctx):
@@ -52,9 +50,7 @@ def check_device(device, ctx):
             s_func = tvm.topi.testing.dispatch(device, _argwhere_schedule)
             sch = s_func(out)
 
-        func = tvm.build(
-            sch, [out_shape, condition, out], device, name="argwhere"
-        )
+        func = tvm.build(sch, [out_shape, condition, out], device, name="argwhere")
 
         # print(func.imported_modules[0].get_source())
 

From 18e10bfd52a220d3ec9addb5fe93b60dc8ab2cd0 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 13 Nov 2020 16:48:33 +0000
Subject: [PATCH 06/13] used dynamic strided_slice

---
 python/tvm/topi/cuda/argwhere.py              | 67 ++++++++++++++++---
 .../python/topi/python/test_topi_argwhere.py  |  2 +-
 2 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 17d2410a517f..9ec532930f21 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -27,6 +27,7 @@
 from .sort import topk, topk_thrust, argsort, argsort_thrust
 from .. import tag
 from ..transform import strided_slice, adv_index, squeeze
+from ..utils import const_vector
 
 logger = logging.getLogger("topi")
 
@@ -36,7 +37,7 @@ def _get_sort_func(mode=0):
     if get_global_func("tvm.contrib.thrust.sort", allow_missing=True):
         ret = topk_thrust if mode == 0 else argsort_thrust
     else:
-        logger.warn(
+        logger.warning(
             "It's highly recommended to enable thrust library with set(USE_THRUST ON)"
             " when compiling argwhere for cuda target. Otherwise, it can result in"
             " significant performance degradation or incorrect result"
@@ -46,6 +47,17 @@ def _get_sort_func(mode=0):
     return ret
 
 
+def _create_end(data, out, end):
+    ib = tvm.tir.ir_builder.create()
+    end = tvm.tir.const(end, dtype=out.dtype)
+    out_ptr = ib.buffer_ptr(out)
+    bx = te.thread_axis("blockIdx.x")
+    ib.scope_attr(bx, "thread_extent", 1)
+    out_ptr[0] = data.shape[0]
+    out_ptr[1] = end
+    return ib.get()
+
+
 def argwhere_1d_ir(condition, out):
     """Low level IR for argwhere 1D
 
@@ -125,7 +137,7 @@ def argwhere_1d(output_shape, condition):
         tag="argwhere1d_gpu",
     )
 
-    if out.shape[0] <= 1:
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
         return out
 
     sorted_out = _get_sort_func()(
@@ -218,23 +230,56 @@ def argwhere_2d(output_shape, condition):
         tag="argwhere2d_gpu",
     )
 
-    if out.shape[0] <= 1:
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
         return out
 
     sort_func = _get_sort_func(1)
 
     # sort the output from the least significant to the most significant
     # column.
-    out1 = strided_slice(out, [0, 1], [out.shape[0], 2])
-    out2 = sort_func(out1, axis=0, dtype="int32")
-    out3 = squeeze(out2)
-    out = adv_index(out, [out3])
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
+        out1 = strided_slice(out, [0, 1], [out.shape[0], 2])
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
 
-    out1 = strided_slice(out, [0, 0], [out.shape[0], 1])
-    out2 = sort_func(out1, axis=0, dtype="int32")
-    out3 = squeeze(out2)
+        out1 = strided_slice(out, [0, 0], [out.shape[0], 1])
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+
+        return adv_index(out, [out3])
+    else:
+        out_shape = [2]
+        out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
+        end = te.extern(
+            [out_shape],
+            [out],
+            lambda ins, outs: _create_end(ins[0], outs[0], 2),
+            dtype="int32",
+            out_buffers=[out_buf],
+            name="strided_slice_gpu_end0",
+            tag="strided_slice_gpu_end0",
+        )
+        out1 = strided_slice(out, const_vector([0, 1]), end)
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
+
+        out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
+        end = te.extern(
+            [out_shape],
+            [out],
+            lambda ins, outs: _create_end(ins[0], outs[0], 1),
+            dtype="int32",
+            out_buffers=[out_buf],
+            name="strided_slice_gpu_end1",
+            tag="strided_slice_gpu_end1",
+        )
+        out1 = strided_slice(out, const_vector([0, 0]), end)
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
 
-    return adv_index(out, [out3])
+        return adv_index(out, [out3])
 
 
 def argwhere_3d_ir(condition, out):
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index b9555ff48f08..21dc6a2864ee 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -52,7 +52,7 @@ def check_device(device, ctx):
 
         func = tvm.build(sch, [out_shape, condition, out], device, name="argwhere")
 
-        # print(func.imported_modules[0].get_source())
+        print(func.imported_modules[0].get_source())
 
         args = [tvm.nd.array(np_shape, ctx)]
         args.append(tvm.nd.array(np_data, ctx))

From 6824edf740a39b1075261e8254566cd714b2880e Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Wed, 2 Dec 2020 01:54:08 +0000
Subject: [PATCH 07/13] Fix dynamic strided_slice

---
 include/tvm/topi/transform.h   | 22 ++++++++++++++++++++++
 python/tvm/topi/cuda/sort.py   |  4 +++-
 python/tvm/topi/transform.py   |  3 +++
 src/topi/transform.cc          |  4 ++++
 tests/python/relay/test_any.py | 10 +++++++++-
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index a04762f28feb..b7f6304fdd29 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -584,6 +584,28 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
       name, tag);
 }
 
+inline te::Tensor dynamic_strided_slice1(const te::Tensor& x, const Array<PrimExpr>& begin,
+                                         const Array<PrimExpr>& end, const Array<PrimExpr>& strides,
+                                         std::string name = "T_strided_slice_dynamic",
+                                         std::string tag = topi::kInjective) {
+  int64_t src_tensor_dim = x->shape.size();
+  Array<PrimExpr> out_shape;
+  for (int64_t i = 0; i < src_tensor_dim; ++i) {
+    out_shape.push_back(tvm::tir::Var("dim"));
+  }
+  return te::compute(
+      out_shape,
+      [&](const Array<tvm::tir::Var>& indices) {
+        Array<PrimExpr> real_indices;
+        for (int32_t i = 0; i < src_tensor_dim; ++i) {
+          real_indices.push_back(indices[i] * strides[i] + begin[i]);
+        }
+        return x(real_indices);
+      },
+      name, tag);
+}
+
+
 /*!
  * \brief strided_slice of a tensor
  *
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 2a7f4eb92daa..537b50710abf 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -21,8 +21,9 @@
 
 from .injective import schedule_injective_from_existing
 from ..math import identity
-from ..transform import strided_slice, transpose
+from ..transform import strided_slice, transpose, dynamic_strided_slice1
 from .. import tag
+from ..tensor import full
 
 
 def swap(arr, axis):
@@ -455,6 +456,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
     out : tvm.te.Tensor or List[tvm.te.Tensor]
         The computed result.
     """
+    return topk_thrust(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64")
     assert ret_type in ["both", "values", "indices"]
     ndim = len(data.shape)
     axis = axis + ndim if axis < 0 else axis
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 6ddbc73e4666..40d97ff21d26 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -218,6 +218,9 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"):
         strides = []
     return cpp.strided_slice(a, begin, end, strides, slice_mode)
 
+def dynamic_strided_slice1(a, begin, end, strides):
+    return cpp.dynamic_strided_slice1(a, begin, end, strides)
+
 
 @tvm.te.tag_scope(tag=tag.INJECTIVE + ",strided_set")
 def strided_set(a, v, begin, end, strides=None):
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index e1e3988f6400..d61790fb1091 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -173,6 +173,10 @@ TVM_REGISTER_GLOBAL("topi.dynamic_strided_slice").set_body([](TVMArgs args, TVMR
   *rv = dynamic_strided_slice(args[0], args[1], args[2], args[3]);
 });
 
+TVM_REGISTER_GLOBAL("topi.dynamic_strided_slice1").set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = dynamic_strided_slice1(args[0], args[1], args[2], args[3]);
+});
+
 TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) {
   int depth = args[3];
   int axis = args[4];
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 3cd41d9e1961..e62b299a8639 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -43,6 +43,8 @@ def check_result(
     for kind in ["debug", "vm"]:
         targets = targets or tvm.testing.enabled_targets()
         for tgt, ctx in targets:
+            if "nvptx" in tgt:
+                continue
             if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type):
                 continue
             ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt)
@@ -809,10 +811,15 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False):
         ref_out = sorted[0:kval]
 
     check_result(in_vals, mod, ref_out)
+<<<<<<< 18e10bfd52a220d3ec9addb5fe93b60dc8ab2cd0
 
 
 # TODO(kevinthesun): enable this test when Thrust is available in ci.
 # @tvm.testing.uses_gpu
+=======
+
+@tvm.testing.uses_gpu
+>>>>>>> Fix dynamic strided_slice
 def test_any_topk():
     verify_any_topk(any_dims(1), 5, (10,), "float32")
     verify_any_topk(any_dims(2), 2, (6, 3), "int32")
@@ -1363,4 +1370,5 @@ def test_any_where():
 
 
 if __name__ == "__main__":
-    pytest.main([__file__])
+    #pytest.main([__file__])
+    test_any_topk()

From 689732a9447b194fedd524ad9852c81e3be2a143 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Wed, 2 Dec 2020 19:25:46 +0000
Subject: [PATCH 08/13] try new strided_slice

---
 python/tvm/topi/cuda/argwhere.py      | 66 ++++++++++++++-------------
 src/tir/transforms/make_packed_api.cc |  1 +
 tests/python/relay/test_any.py        |  2 +-
 3 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 9ec532930f21..400495b99a08 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -26,7 +26,7 @@
 from .nms import atomic_add
 from .sort import topk, topk_thrust, argsort, argsort_thrust
 from .. import tag
-from ..transform import strided_slice, adv_index, squeeze
+from ..transform import strided_slice, adv_index, squeeze, dynamic_strided_slice1
 from ..utils import const_vector
 
 logger = logging.getLogger("topi")
@@ -249,37 +249,39 @@ def argwhere_2d(output_shape, condition):
 
         return adv_index(out, [out3])
     else:
-        out_shape = [2]
-        out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
-        end = te.extern(
-            [out_shape],
-            [out],
-            lambda ins, outs: _create_end(ins[0], outs[0], 2),
-            dtype="int32",
-            out_buffers=[out_buf],
-            name="strided_slice_gpu_end0",
-            tag="strided_slice_gpu_end0",
-        )
-        out1 = strided_slice(out, const_vector([0, 1]), end)
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
-
-        out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
-        end = te.extern(
-            [out_shape],
-            [out],
-            lambda ins, outs: _create_end(ins[0], outs[0], 1),
-            dtype="int32",
-            out_buffers=[out_buf],
-            name="strided_slice_gpu_end1",
-            tag="strided_slice_gpu_end1",
-        )
-        out1 = strided_slice(out, const_vector([0, 0]), end)
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-
-        return adv_index(out, [out3])
+        # out_shape = [2]
+        # out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
+        # end = te.extern(
+        #     [out_shape],
+        #     [out],
+        #     lambda ins, outs: _create_end(ins[0], outs[0], 2),
+        #     dtype="int32",
+        #     out_buffers=[out_buf],
+        #     name="strided_slice_gpu_end0",
+        #     tag="strided_slice_gpu_end0",
+        # )
+        return dynamic_strided_slice1(out, [0, 1], [-1, -1], [1, 1])
+        # out1 = dynamic_strided_slice1(out, [0, 1], [-1, -1])
+        # out1 = strided_slice(out, const_vector([0, 1]), end)
+        # out2 = sort_func(out1, axis=0, dtype="int32")
+        # out3 = squeeze(out2)
+        # out = adv_index(out, [out3])
+
+        # out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
+        # end = te.extern(
+        #     [out_shape],
+        #     [out],
+        #     lambda ins, outs: _create_end(ins[0], outs[0], 1),
+        #     dtype="int32",
+        #     out_buffers=[out_buf],
+        #     name="strided_slice_gpu_end1",
+        #     tag="strided_slice_gpu_end1",
+        # )
+        # out1 = strided_slice(out, const_vector([0, 0]), end)
+        # out2 = sort_func(out1, axis=0, dtype="int32")
+        # out3 = squeeze(out2)
+
+        # return adv_index(out, [out3])
 
 
 def argwhere_3d_ir(condition, out):
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 7c4a8ef92724..23756fb307de 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -46,6 +46,7 @@ inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) {
 }
 
 PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
+  LOG(INFO) << AsText(func, false);
   auto global_symbol = func->GetAttr<String>(tvm::attr::kGlobalSymbol);
   ICHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute";
 
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index e62b299a8639..467679a6570e 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -226,7 +226,7 @@ def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
 
 @tvm.testing.uses_gpu
 def test_any_argwhere():
-    verify_any_argwhere(any_dims(1), (5,))
+    # verify_any_argwhere(any_dims(1), (5,))
     verify_any_argwhere(any_dims(2), (5, 5))
     verify_any_argwhere(any_dims(3), (5, 5, 5))
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5))

From 2524663d92e48c5db87723902d4b2ee0e2312242 Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Wed, 2 Dec 2020 20:26:48 +0000
Subject: [PATCH 09/13] Improve dynamic strided slice to bind data depedent
 shape var.

---
 include/tvm/topi/transform.h     |  2 +-
 python/tvm/topi/cuda/argwhere.py | 10 +++++++++-
 tests/python/relay/test_any.py   | 16 +++++++---------
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index b7f6304fdd29..b5b0c4eda603 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -591,7 +591,7 @@ inline te::Tensor dynamic_strided_slice1(const te::Tensor& x, const Array<PrimEx
   int64_t src_tensor_dim = x->shape.size();
   Array<PrimExpr> out_shape;
   for (int64_t i = 0; i < src_tensor_dim; ++i) {
-    out_shape.push_back(tvm::tir::Var("dim"));
+    out_shape.push_back(indexdiv(end[i] - begin[i], strides[i]));
   }
   return te::compute(
       out_shape,
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 400495b99a08..588680fa83b9 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -260,7 +260,15 @@ def argwhere_2d(output_shape, condition):
         #     name="strided_slice_gpu_end0",
         #     tag="strided_slice_gpu_end0",
         # )
-        return dynamic_strided_slice1(out, [0, 1], [-1, -1], [1, 1])
+        out1 = dynamic_strided_slice1(out, [0, 1], [out.shape[0], 2], [1, 1])
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        out = adv_index(out, [out3])
+
+        out1 = dynamic_strided_slice1(out, [0, 0], [out.shape[0], 1], [1, 1])
+        out2 = sort_func(out1, axis=0, dtype="int32")
+        out3 = squeeze(out2)
+        return adv_index(out, [out3])
         # out1 = dynamic_strided_slice1(out, [0, 1], [-1, -1])
         # out1 = strided_slice(out, const_vector([0, 1]), end)
         # out2 = sort_func(out1, axis=0, dtype="int32")
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 467679a6570e..5b7a4cb507ad 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -43,7 +43,7 @@ def check_result(
     for kind in ["debug", "vm"]:
         targets = targets or tvm.testing.enabled_targets()
         for tgt, ctx in targets:
-            if "nvptx" in tgt:
+            if "cuda" not in tgt:
                 continue
             if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type):
                 continue
@@ -228,19 +228,21 @@ def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
 def test_any_argwhere():
     # verify_any_argwhere(any_dims(1), (5,))
     verify_any_argwhere(any_dims(2), (5, 5))
+    verify_any_argwhere(any_dims(2), (5, 5), "int32")
+    verify_any_argwhere(any_dims(2), (5, 5), "int8")
+    """
     verify_any_argwhere(any_dims(3), (5, 5, 5))
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5))
     verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5))
     verify_any_argwhere(any_dims(1), (5,), "int32")
-    verify_any_argwhere(any_dims(2), (5, 5), "int32")
     verify_any_argwhere(any_dims(3), (5, 5, 5), "int32")
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5), "int32")
     verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5), "int32")
     verify_any_argwhere(any_dims(1), (5,), "int8")
-    verify_any_argwhere(any_dims(2), (5, 5), "int8")
     verify_any_argwhere(any_dims(3), (5, 5, 5), "int8")
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5), "int8")
     verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5), "int8")
+    """
 
 
 def verify_any_take(data_shape, indices_shape, axis, data_np_shape, indices_np_shape):
@@ -811,15 +813,10 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False):
         ref_out = sorted[0:kval]
 
     check_result(in_vals, mod, ref_out)
-<<<<<<< 18e10bfd52a220d3ec9addb5fe93b60dc8ab2cd0
 
 
 # TODO(kevinthesun): enable this test when Thrust is available in ci.
 # @tvm.testing.uses_gpu
-=======
-
-@tvm.testing.uses_gpu
->>>>>>> Fix dynamic strided_slice
 def test_any_topk():
     verify_any_topk(any_dims(1), 5, (10,), "float32")
     verify_any_topk(any_dims(2), 2, (6, 3), "int32")
@@ -1371,4 +1368,5 @@ def test_any_where():
 
 if __name__ == "__main__":
     #pytest.main([__file__])
-    test_any_topk()
+    #test_any_topk()
+    test_any_argwhere()

From 8c857663465314e8208bf4819637bd7b5563f0e1 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Wed, 2 Dec 2020 21:40:24 +0000
Subject: [PATCH 10/13] all tests pass

---
 python/tvm/topi/cuda/argwhere.py      | 106 ++++++++++----------------
 python/tvm/topi/transform.py          |   1 +
 src/tir/transforms/make_packed_api.cc |   1 -
 tests/python/relay/test_any.py        |   8 +-
 4 files changed, 45 insertions(+), 71 deletions(-)

diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 588680fa83b9..5dc6808e6af8 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -27,7 +27,6 @@
 from .sort import topk, topk_thrust, argsort, argsort_thrust
 from .. import tag
 from ..transform import strided_slice, adv_index, squeeze, dynamic_strided_slice1
-from ..utils import const_vector
 
 logger = logging.getLogger("topi")
 
@@ -47,17 +46,6 @@ def _get_sort_func(mode=0):
     return ret
 
 
-def _create_end(data, out, end):
-    ib = tvm.tir.ir_builder.create()
-    end = tvm.tir.const(end, dtype=out.dtype)
-    out_ptr = ib.buffer_ptr(out)
-    bx = te.thread_axis("blockIdx.x")
-    ib.scope_attr(bx, "thread_extent", 1)
-    out_ptr[0] = data.shape[0]
-    out_ptr[1] = end
-    return ib.get()
-
-
 def argwhere_1d_ir(condition, out):
     """Low level IR for argwhere 1D
 
@@ -247,19 +235,8 @@ def argwhere_2d(output_shape, condition):
         out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
 
-        return adv_index(out, [out3])
+        out = adv_index(out, [out3])
     else:
-        # out_shape = [2]
-        # out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
-        # end = te.extern(
-        #     [out_shape],
-        #     [out],
-        #     lambda ins, outs: _create_end(ins[0], outs[0], 2),
-        #     dtype="int32",
-        #     out_buffers=[out_buf],
-        #     name="strided_slice_gpu_end0",
-        #     tag="strided_slice_gpu_end0",
-        # )
         out1 = dynamic_strided_slice1(out, [0, 1], [out.shape[0], 2], [1, 1])
         out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
@@ -268,28 +245,8 @@ def argwhere_2d(output_shape, condition):
         out1 = dynamic_strided_slice1(out, [0, 0], [out.shape[0], 1], [1, 1])
         out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
-        return adv_index(out, [out3])
-        # out1 = dynamic_strided_slice1(out, [0, 1], [-1, -1])
-        # out1 = strided_slice(out, const_vector([0, 1]), end)
-        # out2 = sort_func(out1, axis=0, dtype="int32")
-        # out3 = squeeze(out2)
-        # out = adv_index(out, [out3])
-
-        # out_buf = tvm.tir.decl_buffer(out_shape, "int32", "strided_slice_out_buf")
-        # end = te.extern(
-        #     [out_shape],
-        #     [out],
-        #     lambda ins, outs: _create_end(ins[0], outs[0], 1),
-        #     dtype="int32",
-        #     out_buffers=[out_buf],
-        #     name="strided_slice_gpu_end1",
-        #     tag="strided_slice_gpu_end1",
-        # )
-        # out1 = strided_slice(out, const_vector([0, 0]), end)
-        # out2 = sort_func(out1, axis=0, dtype="int32")
-        # out3 = squeeze(out2)
-
-        # return adv_index(out, [out3])
+        out = adv_index(out, [out3])
+    return out
 
 
 def argwhere_3d_ir(condition, out):
@@ -382,18 +339,25 @@ def argwhere_3d(output_shape, condition):
         tag="argwhere3d_gpu",
     )
 
-    if out.shape[0] <= 1:
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
         return out
 
     # sort the output from the least significant to the most significant
     # column.
     sort_func = _get_sort_func(1)
-    for i in reversed(range(3)):
-        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
 
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
+        for i in reversed(range(3)):
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
+    else:
+        for i in reversed(range(3)):
+            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
     return out
 
 
@@ -490,17 +454,24 @@ def argwhere_4d(output_shape, condition):
         tag="argwhere4d_gpu",
     )
 
-    if out.shape[0] <= 1:
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
         return out
 
     # sort the output from the least significant to the most significant
     # column.
     sort_func = _get_sort_func(1)
-    for i in reversed(range(4)):
-        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
+        for i in reversed(range(4)):
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
+    else:
+        for i in reversed(range(4)):
+            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
 
     return out
 
@@ -601,17 +572,24 @@ def argwhere_5d(output_shape, condition):
         tag="argwhere5d_gpu",
     )
 
-    if out.shape[0] <= 1:
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1:
         return out
 
     # sort the output from the least significant to the most significant
     # column.
     sort_func = _get_sort_func(1)
-    for i in reversed(range(5)):
-        out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
-        out2 = sort_func(out1, axis=0, dtype="int32")
-        out3 = squeeze(out2)
-        out = adv_index(out, [out3])
+    if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)):
+        for i in reversed(range(5)):
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
+    else:
+        for i in reversed(range(5)):
+            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out2 = sort_func(out1, axis=0, dtype="int32")
+            out3 = squeeze(out2)
+            out = adv_index(out, [out3])
 
     return out
 
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 40d97ff21d26..7c82ef2da9bd 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -218,6 +218,7 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"):
         strides = []
     return cpp.strided_slice(a, begin, end, strides, slice_mode)
 
+
 def dynamic_strided_slice1(a, begin, end, strides):
     return cpp.dynamic_strided_slice1(a, begin, end, strides)
 
diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc
index 23756fb307de..7c4a8ef92724 100644
--- a/src/tir/transforms/make_packed_api.cc
+++ b/src/tir/transforms/make_packed_api.cc
@@ -46,7 +46,6 @@ inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) {
 }
 
 PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) {
-  LOG(INFO) << AsText(func, false);
   auto global_symbol = func->GetAttr<String>(tvm::attr::kGlobalSymbol);
   ICHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute";
 
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 5b7a4cb507ad..6f53fbb30584 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -226,11 +226,10 @@ def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
 
 @tvm.testing.uses_gpu
 def test_any_argwhere():
-    # verify_any_argwhere(any_dims(1), (5,))
+    verify_any_argwhere(any_dims(1), (5,))
     verify_any_argwhere(any_dims(2), (5, 5))
     verify_any_argwhere(any_dims(2), (5, 5), "int32")
     verify_any_argwhere(any_dims(2), (5, 5), "int8")
-    """
     verify_any_argwhere(any_dims(3), (5, 5, 5))
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5))
     verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5))
@@ -242,7 +241,6 @@ def test_any_argwhere():
     verify_any_argwhere(any_dims(3), (5, 5, 5), "int8")
     verify_any_argwhere(any_dims(4), (5, 5, 5, 5), "int8")
     verify_any_argwhere(any_dims(5), (5, 5, 5, 5, 5), "int8")
-    """
 
 
 def verify_any_take(data_shape, indices_shape, axis, data_np_shape, indices_np_shape):
@@ -1367,6 +1365,4 @@ def test_any_where():
 
 
 if __name__ == "__main__":
-    #pytest.main([__file__])
-    #test_any_topk()
-    test_any_argwhere()
+    pytest.main([__file__])

From 736dfe7ba507f3c852d0f55495d60c54e57674d5 Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Wed, 2 Dec 2020 21:42:08 +0000
Subject: [PATCH 11/13] remove print

---
 tests/python/topi/python/test_topi_argwhere.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index 21dc6a2864ee..69993d287b79 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -52,14 +52,11 @@ def check_device(device, ctx):
 
         func = tvm.build(sch, [out_shape, condition, out], device, name="argwhere")
 
-        print(func.imported_modules[0].get_source())
-
         args = [tvm.nd.array(np_shape, ctx)]
         args.append(tvm.nd.array(np_data, ctx))
         args.append(tvm.nd.empty(out.shape, ctx=ctx, dtype=condition.dtype))
         func(*args)
         np.set_printoptions(threshold=np.inf)
-        # print(args[-1].asnumpy())
         tvm.testing.assert_allclose(args[-1].asnumpy(), np.array(np_out))
 
     for target, ctx in tvm.testing.enabled_targets():

From ea8ba0cd88ee9e4347868b1bb13438b356e55f2d Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 4 Dec 2020 05:57:58 +0000
Subject: [PATCH 12/13] use new strided_slice

---
 include/tvm/topi/transform.h                  | 22 -------------------
 python/tvm/topi/cuda/argwhere.py              | 12 +++++-----
 python/tvm/topi/cuda/sort.py                  |  4 +---
 python/tvm/topi/transform.py                  |  4 ----
 src/topi/transform.cc                         |  4 ----
 tests/python/relay/test_any.py                |  4 +++-
 .../python/topi/python/test_topi_argwhere.py  |  4 +++-
 7 files changed, 13 insertions(+), 41 deletions(-)

diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index b5b0c4eda603..a04762f28feb 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -584,28 +584,6 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
       name, tag);
 }
 
-inline te::Tensor dynamic_strided_slice1(const te::Tensor& x, const Array<PrimExpr>& begin,
-                                         const Array<PrimExpr>& end, const Array<PrimExpr>& strides,
-                                         std::string name = "T_strided_slice_dynamic",
-                                         std::string tag = topi::kInjective) {
-  int64_t src_tensor_dim = x->shape.size();
-  Array<PrimExpr> out_shape;
-  for (int64_t i = 0; i < src_tensor_dim; ++i) {
-    out_shape.push_back(indexdiv(end[i] - begin[i], strides[i]));
-  }
-  return te::compute(
-      out_shape,
-      [&](const Array<tvm::tir::Var>& indices) {
-        Array<PrimExpr> real_indices;
-        for (int32_t i = 0; i < src_tensor_dim; ++i) {
-          real_indices.push_back(indices[i] * strides[i] + begin[i]);
-        }
-        return x(real_indices);
-      },
-      name, tag);
-}
-
-
 /*!
  * \brief strided_slice of a tensor
  *
diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py
index 5dc6808e6af8..e39004dc76a9 100644
--- a/python/tvm/topi/cuda/argwhere.py
+++ b/python/tvm/topi/cuda/argwhere.py
@@ -26,7 +26,7 @@
 from .nms import atomic_add
 from .sort import topk, topk_thrust, argsort, argsort_thrust
 from .. import tag
-from ..transform import strided_slice, adv_index, squeeze, dynamic_strided_slice1
+from ..transform import strided_slice, adv_index, squeeze
 
 logger = logging.getLogger("topi")
 
@@ -237,12 +237,12 @@ def argwhere_2d(output_shape, condition):
 
         out = adv_index(out, [out3])
     else:
-        out1 = dynamic_strided_slice1(out, [0, 1], [out.shape[0], 2], [1, 1])
+        out1 = strided_slice(out, [0, 1], [out.shape[0], 2], [1, 1])
         out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
         out = adv_index(out, [out3])
 
-        out1 = dynamic_strided_slice1(out, [0, 0], [out.shape[0], 1], [1, 1])
+        out1 = strided_slice(out, [0, 0], [out.shape[0], 1], [1, 1])
         out2 = sort_func(out1, axis=0, dtype="int32")
         out3 = squeeze(out2)
         out = adv_index(out, [out3])
@@ -354,7 +354,7 @@ def argwhere_3d(output_shape, condition):
             out = adv_index(out, [out3])
     else:
         for i in reversed(range(3)):
-            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
             out2 = sort_func(out1, axis=0, dtype="int32")
             out3 = squeeze(out2)
             out = adv_index(out, [out3])
@@ -468,7 +468,7 @@ def argwhere_4d(output_shape, condition):
             out = adv_index(out, [out3])
     else:
         for i in reversed(range(4)):
-            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
             out2 = sort_func(out1, axis=0, dtype="int32")
             out3 = squeeze(out2)
             out = adv_index(out, [out3])
@@ -586,7 +586,7 @@ def argwhere_5d(output_shape, condition):
             out = adv_index(out, [out3])
     else:
         for i in reversed(range(5)):
-            out1 = dynamic_strided_slice1(out, [0, i], [out.shape[0], i + 1], [1, 1])
+            out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1])
             out2 = sort_func(out1, axis=0, dtype="int32")
             out3 = squeeze(out2)
             out = adv_index(out, [out3])
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
index 537b50710abf..2a7f4eb92daa 100644
--- a/python/tvm/topi/cuda/sort.py
+++ b/python/tvm/topi/cuda/sort.py
@@ -21,9 +21,8 @@
 
 from .injective import schedule_injective_from_existing
 from ..math import identity
-from ..transform import strided_slice, transpose, dynamic_strided_slice1
+from ..transform import strided_slice, transpose
 from .. import tag
-from ..tensor import full
 
 
 def swap(arr, axis):
@@ -456,7 +455,6 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
     out : tvm.te.Tensor or List[tvm.te.Tensor]
         The computed result.
     """
-    return topk_thrust(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64")
     assert ret_type in ["both", "values", "indices"]
     ndim = len(data.shape)
     axis = axis + ndim if axis < 0 else axis
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 7c82ef2da9bd..6ddbc73e4666 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -219,10 +219,6 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"):
     return cpp.strided_slice(a, begin, end, strides, slice_mode)
 
 
-def dynamic_strided_slice1(a, begin, end, strides):
-    return cpp.dynamic_strided_slice1(a, begin, end, strides)
-
-
 @tvm.te.tag_scope(tag=tag.INJECTIVE + ",strided_set")
 def strided_set(a, v, begin, end, strides=None):
     """Set slice of an array.
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index d61790fb1091..e1e3988f6400 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -173,10 +173,6 @@ TVM_REGISTER_GLOBAL("topi.dynamic_strided_slice").set_body([](TVMArgs args, TVMR
   *rv = dynamic_strided_slice(args[0], args[1], args[2], args[3]);
 });
 
-TVM_REGISTER_GLOBAL("topi.dynamic_strided_slice1").set_body([](TVMArgs args, TVMRetValue* rv) {
-  *rv = dynamic_strided_slice1(args[0], args[1], args[2], args[3]);
-});
-
 TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) {
   int depth = args[3];
   int axis = args[4];
diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index 6f53fbb30584..df7bd6d09e15 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -224,7 +224,9 @@ def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"):
     check_result([data], mod, expected, flatten=True)
 
 
-@tvm.testing.uses_gpu
+# TODO(zhiics) Enable argwhere gpu test after sort is fixed. Otherwise, we have
+# to use thrust to guarantee the correct results which has been tested locally.
+# @tvm.testing.uses_gpu
 def test_any_argwhere():
     verify_any_argwhere(any_dims(1), (5,))
     verify_any_argwhere(any_dims(2), (5, 5))
diff --git a/tests/python/topi/python/test_topi_argwhere.py b/tests/python/topi/python/test_topi_argwhere.py
index 69993d287b79..5cb7cd44513e 100644
--- a/tests/python/topi/python/test_topi_argwhere.py
+++ b/tests/python/topi/python/test_topi_argwhere.py
@@ -63,7 +63,9 @@ def check_device(device, ctx):
         check_device(target, ctx)
 
 
-@tvm.testing.uses_gpu
+# TODO(zhiics) Enable argwhere gpu test after sort is fixed. Otherwise, we have
+# to use thrust to guarantee the correct results which has been tested locally.
+# @tvm.testing.uses_gpu
 def test_argwhere():
     verify_argwhere((1,))
     verify_argwhere((100,))

From 0a7725025a15b7a66b1159ff7730fadd5182a47d Mon Sep 17 00:00:00 2001
From: Zhi Chen <chzhi@amazon.com>
Date: Fri, 4 Dec 2020 17:53:06 +0000
Subject: [PATCH 13/13] clean

---
 tests/python/relay/test_any.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py
index df7bd6d09e15..ddf8e980706b 100644
--- a/tests/python/relay/test_any.py
+++ b/tests/python/relay/test_any.py
@@ -43,8 +43,6 @@ def check_result(
     for kind in ["debug", "vm"]:
         targets = targets or tvm.testing.enabled_targets()
         for tgt, ctx in targets:
-            if "cuda" not in tgt:
-                continue
             if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type):
                 continue
             ex = relay.create_executor(kind, mod=mod, ctx=ctx, target=tgt)