From 560c8da96fa48a8d0b6b573883e885242da96374 Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Thu, 12 Jan 2023 16:54:57 +0800
Subject: [PATCH 1/6] feat: combine cutlass and ansor

---
 .../tvm/auto_scheduler/relay_integration.py   |   4 +-
 tests/python/contrib/test_cutlass_ansor.py    | 259 ++++++++++++++++++
 2 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 tests/python/contrib/test_cutlass_ansor.py

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 52c7f44fcede..30e268473037 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -84,6 +84,7 @@ def extract_tasks(
     include_simple_tasks=False,
     dump_workload_to_dag_log=None,
     opt_level=3,
+    other_targets=[],
 ):
     """Extract tuning tasks from a relay program.
 
@@ -125,12 +126,13 @@ def extract_tasks(
     old_verbose = dispatch_ctx.verbose
     dispatch_ctx.verbose = 0
 
+    targets = [target] + other_targets
     errors = []
     with env:
         # Wrap build call in a new thread to avoid the conflict
         # between python's multiprocessing and tvm's thread pool
         build_thread = threading.Thread(
-            target=call_all_topi_funcs, args=(mod, params, target, errors, opt_level)
+            target=call_all_topi_funcs, args=(mod, params, targets, errors, opt_level)
         )
         build_thread.start()
         build_thread.join()
diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py
new file mode 100644
index 000000000000..24fd25f350d8
--- /dev/null
+++ b/tests/python/contrib/test_cutlass_ansor.py
@@ -0,0 +1,259 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import logging
+import math
+import tvm
+from tvm import relay
+from tvm.contrib.cudnn import conv_output_shape
+import numpy as np
+from tvm.runtime.vm import VirtualMachine
+from tvm.relay import op as _op
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType
+from tvm import auto_scheduler
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+import tvm.testing
+
+logging.basicConfig(level=logging.INFO)
+
+
+def get_ref_rt_mod(mod, params, target="cuda"):
+    with tvm.transform.PassContext(opt_level=3):
+        lib = relay.build(mod, target=target, params=params)
+    dev = tvm.device(target, 0)
+    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+    return rt_mod, dev
+
+
+def get_random_ndarray(shape, dtype):
+    if dtype == "int8":
+        return np.random.randint(-128, 128, shape).astype(dtype)
+    elif dtype == "uint8":
+        return np.random.randint(0, 256, shape).astype(dtype)
+    return np.random.uniform(-1, 1, shape).astype(dtype)
+
+
+def get_output(rt_mod, names, inputs):
+    for name, inp in zip(names, inputs):
+        rt_mod.set_input(name, inp)
+    rt_mod.run()
+    return rt_mod.get_output(0).numpy()
+
+
+def get_dense_transpose_dense(M, N, K, dtype="float16"):
+    """
+    dense: [M, K] * [N, K] -> [M, N]
+    transpose: [M, N] -> [N, M]
+    dense: [N, M] * [K, M] -> [N, K]
+
+    input: [M, K]
+    weight0: [N, K]
+    weight1: [K, M]
+    """
+    in_shape = (M, K)
+    w0_shape = (N, K)
+    w1_shape = (K, M)
+
+    input = relay.var("input", shape=in_shape, dtype=dtype)
+    w0 = relay.var("weight0", shape=w0_shape, dtype=dtype)
+    w1 = relay.var("weight1", shape=w1_shape, dtype=dtype)
+
+    one = _op.const(1, dtype=dtype)
+    two = _op.const(2, dtype=dtype)
+    
+    out0 = relay.nn.dense(input, w0, out_dtype=dtype)
+    input1 = _op.transpose(out0, axes=(1, 0))
+    out1 = relay.nn.dense(input1, w1, out_dtype=dtype)
+    return out1
+
+
+def build_by_cutlass(
+    mod,
+    params,
+    sm,
+    split_k_slices=[1],
+    tmp_dir="./tmp",
+    use_fast_math=False,
+    use_3xtf32=True,
+):
+    logging.info("before partitioning:\n%s", mod)
+    mod = partition_for_cutlass(mod)
+    logging.info("after partitioning:\n%s", mod)
+
+    num_cutlass_partition = num_cutlass_partitions(mod)
+    host = tvm.target.Target("llvm")
+    cuda = tvm.target.Target("cuda", host=host)
+    cutlass = tvm.target.Target(
+        {
+            "kind": "cutlass",
+            "sm": sm,
+            "use_3xtf32": use_3xtf32,
+            "split_k_slices": split_k_slices,
+            "profile_all_alignments": False,
+            "find_first_valid": True,
+            "use_multiprocessing": True,
+            "use_fast_math": use_fast_math,
+            "tmp_dir": tmp_dir,
+        },
+        host=host,
+    )
+    with tvm.transform.PassContext(opt_level=3):
+        lib = relay.build(mod, target=[cuda, cutlass], params=params)
+    lib = finalize_modules(lib, "compile.so", tmp_dir)
+    dev = tvm.device("cuda", 0)
+    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+    return rt_mod, dev, num_cutlass_partition
+
+
+def build_by_cutlass_ansor(
+    mod,
+    params,
+    sm,
+    split_k_slices=[1],
+    tmp_dir="./tmp",
+    use_fast_math=False,
+    use_3xtf32=True,
+    num_trials=10,
+):
+    logging.info("before partitioning:\n%s", mod)
+    mod = partition_for_cutlass(mod)
+    logging.info("after partitioning:\n%s", mod)
+
+    num_cutlass_partition = num_cutlass_partitions(mod)
+    host = tvm.target.Target("llvm")
+    cuda = tvm.target.Target("cuda", host=host)
+    cutlass = tvm.target.Target(
+        {
+            "kind": "cutlass",
+            "sm": sm,
+            "use_3xtf32": use_3xtf32,
+            "split_k_slices": split_k_slices,
+            "profile_all_alignments": False,
+            "find_first_valid": True,
+            "use_multiprocessing": True,
+            "use_fast_math": use_fast_math,
+            "tmp_dir": tmp_dir,
+        },
+        host=host,
+    )
+
+    # extract tasks
+    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+        tasks, task_weights = auto_scheduler.extract_tasks(
+                mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass])
+    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
+        print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====")
+        print(task.compute_dag)
+
+    # auto-tuning
+    log_file = "cutlass_ansor.log"
+    measure_ctx = auto_scheduler.LocalRPCMeasureContext(
+        repeat=3, min_repeat_ms=200, timeout=10
+    )
+    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+    tuner.tune(
+        auto_scheduler.TuningOptions(
+            num_measure_trials=num_trials,
+            runner=measure_ctx.runner,
+            measure_callbacks=[
+                auto_scheduler.RecordToFile(log_file),
+            ],
+        )
+    )
+
+    with auto_scheduler.ApplyHistoryBest(log_file):
+        with tvm.transform.PassContext(
+            opt_level=3,
+            config={"relay.backend.use_auto_scheduler": True},
+        ):
+            lib = relay.build(
+                mod,
+                target=cuda,
+                target_host=host,
+                params=params,
+            )
+    lib = finalize_modules(lib, "compile.so", tmp_dir)
+    dev = tvm.device("cuda", 0)
+    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+    return rt_mod, dev, num_cutlass_partition
+
+
+def verify_dense_transpose_dense(
+    func,
+    M,
+    N,
+    K,
+    ref_target="cuda",
+    sm=80,
+    atol=1e-5,
+    rtol=1e-5,
+    run_benchmark=False,
+    dtype="float16",
+    use_3xtf32=True,
+):
+    assert has_cutlass()
+    if sm < 80 and dtype == "float32":
+        return
+
+    mod = tvm.IRModule.from_expr(func)
+    typ = relay.transform.InferType()(mod)["main"].body.checked_type
+    np_data = get_random_ndarray((M, K), dtype)
+    np_weight0 = get_random_ndarray((N, K), dtype)
+    np_weight1 = get_random_ndarray((K, M), dtype)
+
+    params = {"weight0": np_weight0, "weight1": np_weight1}
+
+    rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
+    cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32)
+    cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(mod, params, sm, use_3xtf32=use_3xtf32)
+    x = tvm.nd.array(np_data, device=dev)
+    cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])
+    cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x])
+    ref_out = get_output(rt_mod_ref, ["input"], [x])
+
+    assert num_partition > 0
+    np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol)
+    np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol)
+
+    if run_benchmark:
+        print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600))
+        print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600))
+        print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600))
+
+
+M = 128
+N = 128
+K = 128
+
+# Use larger M/N/K for significant performance improvement
+# M = 1024
+# N = 1024
+# K = 1024
+
+
+@tvm.testing.requires_cutlass
+def test_dense_transpose_dense():
+    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K, sm=75, run_benchmark=True)
+
+
+if __name__ == "__main__":
+     tvm.testing.main()

From d43f1796ff3b095927f0bbfbcd3d53b4a145edd3 Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Tue, 31 Jan 2023 11:26:34 +0800
Subject: [PATCH 2/6] use sm80 and disable run_benchmark

---
 tests/python/contrib/test_cutlass_ansor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py
index 24fd25f350d8..92bbd64c99c4 100644
--- a/tests/python/contrib/test_cutlass_ansor.py
+++ b/tests/python/contrib/test_cutlass_ansor.py
@@ -252,7 +252,7 @@ def verify_dense_transpose_dense(
 
 @tvm.testing.requires_cutlass
 def test_dense_transpose_dense():
-    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K, sm=75, run_benchmark=True)
+    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K)
 
 
 if __name__ == "__main__":

From b86e9d9b9e8157ba7d74ec61b6ed10deb61885ca Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Tue, 31 Jan 2023 11:55:12 +0800
Subject: [PATCH 3/6] fix lint error

---
 tests/python/contrib/test_cutlass_ansor.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py
index 92bbd64c99c4..ca14b0a32101 100644
--- a/tests/python/contrib/test_cutlass_ansor.py
+++ b/tests/python/contrib/test_cutlass_ansor.py
@@ -77,9 +77,6 @@ def get_dense_transpose_dense(M, N, K, dtype="float16"):
     w0 = relay.var("weight0", shape=w0_shape, dtype=dtype)
     w1 = relay.var("weight1", shape=w1_shape, dtype=dtype)
 
-    one = _op.const(1, dtype=dtype)
-    two = _op.const(2, dtype=dtype)
-    
     out0 = relay.nn.dense(input, w0, out_dtype=dtype)
     input1 = _op.transpose(out0, axes=(1, 0))
     out1 = relay.nn.dense(input1, w1, out_dtype=dtype)
@@ -159,16 +156,15 @@ def build_by_cutlass_ansor(
     # extract tasks
     with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
         tasks, task_weights = auto_scheduler.extract_tasks(
-                mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass])
+            mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]
+        )
     for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
         print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====")
         print(task.compute_dag)
 
     # auto-tuning
     log_file = "cutlass_ansor.log"
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext(
-        repeat=3, min_repeat_ms=200, timeout=10
-    )
+    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
     tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
     tuner.tune(
         auto_scheduler.TuningOptions(
@@ -224,7 +220,9 @@ def verify_dense_transpose_dense(
 
     rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
     cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32)
-    cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(mod, params, sm, use_3xtf32=use_3xtf32)
+    cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(
+        mod, params, sm, use_3xtf32=use_3xtf32
+    )
     x = tvm.nd.array(np_data, device=dev)
     cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])
     cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x])
@@ -256,4 +254,4 @@ def test_dense_transpose_dense():
 
 
 if __name__ == "__main__":
-     tvm.testing.main()
+    tvm.testing.main()

From 639ce62d20fe9eaac6a80f0ca057ad1f3bf44e1e Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Tue, 31 Jan 2023 12:59:22 +0800
Subject: [PATCH 4/6] use tempfile; fix dangerous default value

---
 .../tvm/auto_scheduler/relay_integration.py   |  8 ++-
 tests/python/contrib/test_cutlass_ansor.py    | 51 +++++++++----------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 30e268473037..973cbf19bece 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -84,7 +84,7 @@ def extract_tasks(
     include_simple_tasks=False,
     dump_workload_to_dag_log=None,
     opt_level=3,
-    other_targets=[],
+    other_targets=None,
 ):
     """Extract tuning tasks from a relay program.
 
@@ -106,6 +106,8 @@ def extract_tasks(
         A file to dump an association between the workload keys and the actual DAG
     opt_level : Optional[int]
         The optimization level of the task extractions.
+    other_targets : Optional[List[tvm.target.Target]]
+        Extra targets passed to call_all_topi_funcs with `target`, e.g., a cutlass target.
 
     Returns
     -------
@@ -126,7 +128,9 @@ def extract_tasks(
     old_verbose = dispatch_ctx.verbose
     dispatch_ctx.verbose = 0
 
-    targets = [target] + other_targets
+    targets = [target]
+    if other_targets is not None:
+        targets += other_targets
     errors = []
     with env:
         # Wrap build call in a new thread to avoid the conflict
diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py
index ca14b0a32101..931c9b28b287 100644
--- a/tests/python/contrib/test_cutlass_ansor.py
+++ b/tests/python/contrib/test_cutlass_ansor.py
@@ -15,21 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 import logging
-import math
+import tempfile
 import tvm
 from tvm import relay
-from tvm.contrib.cudnn import conv_output_shape
 import numpy as np
-from tvm.runtime.vm import VirtualMachine
 from tvm.relay import op as _op
 from tvm.relay.op.contrib.cutlass import partition_for_cutlass
-from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType
 from tvm import auto_scheduler
 from tvm.contrib.cutlass import (
     has_cutlass,
     num_cutlass_partitions,
     finalize_modules,
-    finalize_modules_vm,
 )
 import tvm.testing
 
@@ -163,30 +159,31 @@ def build_by_cutlass_ansor(
         print(task.compute_dag)
 
     # auto-tuning
-    log_file = "cutlass_ansor.log"
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
-    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-    tuner.tune(
-        auto_scheduler.TuningOptions(
-            num_measure_trials=num_trials,
-            runner=measure_ctx.runner,
-            measure_callbacks=[
-                auto_scheduler.RecordToFile(log_file),
-            ],
+    with tempfile.NamedTemporaryFile() as fp:
+        log_file = fp.name
+        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
+        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+        tuner.tune(
+            auto_scheduler.TuningOptions(
+                num_measure_trials=num_trials,
+                runner=measure_ctx.runner,
+                measure_callbacks=[
+                    auto_scheduler.RecordToFile(log_file),
+                ],
+            )
         )
-    )
 
-    with auto_scheduler.ApplyHistoryBest(log_file):
-        with tvm.transform.PassContext(
-            opt_level=3,
-            config={"relay.backend.use_auto_scheduler": True},
-        ):
-            lib = relay.build(
-                mod,
-                target=cuda,
-                target_host=host,
-                params=params,
-            )
+        with auto_scheduler.ApplyHistoryBest(log_file):
+            with tvm.transform.PassContext(
+                opt_level=3,
+                config={"relay.backend.use_auto_scheduler": True},
+            ):
+                lib = relay.build(
+                    mod,
+                    target=cuda,
+                    target_host=host,
+                    params=params,
+                )
     lib = finalize_modules(lib, "compile.so", tmp_dir)
     dev = tvm.device("cuda", 0)
     rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

From 1096f9417dac25a8482e320a0dedb741682f279f Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Wed, 1 Feb 2023 14:22:42 +0800
Subject: [PATCH 5/6] merge cutlass_ansor test into test_cutlass.py

---
 tests/python/contrib/test_cutlass.py       | 126 +++++++++-
 tests/python/contrib/test_cutlass_ansor.py | 254 ---------------------
 2 files changed, 124 insertions(+), 256 deletions(-)
 delete mode 100644 tests/python/contrib/test_cutlass_ansor.py

diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py
index 753ee178f9d3..c52a22202df7 100644
--- a/tests/python/contrib/test_cutlass.py
+++ b/tests/python/contrib/test_cutlass.py
@@ -15,13 +15,16 @@
 # specific language governing permissions and limitations
 # under the License.
 import logging
+import tempfile
 import math
 import tvm
 from tvm import relay
 from tvm.contrib.cudnn import conv_output_shape
 import numpy as np
+from tvm.relay import op as _op
 from tvm.runtime.vm import VirtualMachine
 from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import auto_scheduler
 from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType
 from tvm.contrib.cutlass import (
     has_cutlass,
@@ -235,6 +238,32 @@ def get_conv2d_backward_weight(
     )
 
 
+def get_dense_transpose_dense(M, N, K, dtype="float16"):
+    """Build the Relay expression dense -> transpose -> dense.
+
+    output = nn.dense(_op.transpose(nn.dense(input, weight0), axes=(1, 0)), weight1)
+
+    dense0: [M, K] * [N, K] -> [M, N]
+    transpose: [M, N] -> [N, M]
+    dense1: [N, M] * [K, M] -> [N, K]
+
+    Free variables: input [M, K], weight0 [N, K], weight1 [K, M], all of `dtype`.
+    """
+    input_shape = (M, K)
+    weight0_shape = (N, K)
+    weight1_shape = (K, M)
+
+    # Named `data` locally so the Python builtin `input` is not shadowed;
+    # the Relay variable itself is still called "input".
+    data = relay.var("input", shape=input_shape, dtype=dtype)
+    weight0 = relay.var("weight0", shape=weight0_shape, dtype=dtype)
+    weight1 = relay.var("weight1", shape=weight1_shape, dtype=dtype)
+
+    output0 = relay.nn.dense(data, weight0, out_dtype=dtype)
+    transposed = _op.transpose(output0, axes=(1, 0))
+    return relay.nn.dense(transposed, weight1, out_dtype=dtype)
+
+
 def convert_conv2d_layout(mod, desired_layouts):
     with tvm.transform.PassContext(opt_level=3):
         seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)])
@@ -257,6 +286,8 @@ def profile_and_build(
     tmp_dir="./tmp",
     use_fast_math=False,
     use_3xtf32=True,
+    use_ansor=False,
+    ansor_tuning=False,
 ):
     logging.info("before partitioning:\n%s", mod)
     mod = partition_for_cutlass(mod)
@@ -279,8 +310,47 @@ def profile_and_build(
         },
         host=host,
     )
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=[cuda, cutlass], params=params)
+
+    if use_ansor:
+        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+            tasks, task_weights = auto_scheduler.extract_tasks(
+                mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]
+            )
+        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
+            logging.info(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====")
+            logging.info(task.compute_dag)
+
+        with tempfile.NamedTemporaryFile() as fp:
+            log_file = fp.name
+
+            # auto-tuning is disabled by default
+            if ansor_tuning:
+                measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
+                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+                tuner.tune(
+                    auto_scheduler.TuningOptions(
+                        num_measure_trials=100,
+                        runner=measure_ctx.runner,
+                        measure_callbacks=[
+                            auto_scheduler.RecordToFile(log_file),
+                        ],
+                    )
+                )
+
+            with auto_scheduler.ApplyHistoryBest(log_file):
+                with tvm.transform.PassContext(
+                    opt_level=3,
+                    config={"relay.backend.use_auto_scheduler": True},
+                ):
+                    lib = relay.build(
+                        mod,
+                        target=cuda,
+                        target_host=host,
+                        params=params,
+                    )
+    else:
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(mod, target=[cuda, cutlass], params=params)
     lib = finalize_modules(lib, "compile.so", tmp_dir)
     dev = tvm.device("cuda", 0)
     rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
@@ -959,5 +1029,57 @@ def test_conv2d_bwd():
     )
 
 
+def verify_dense_transpose_dense(
+    func,
+    M,
+    N,
+    K,
+    ref_target="cuda",
+    sm=80,
+    atol=1e-5,
+    rtol=1e-5,
+    run_benchmark=False,
+    dtype="float16",
+    use_3xtf32=True,
+):
+    assert has_cutlass()
+    if sm < 80 and dtype == "float32":
+        return
+
+    mod = tvm.IRModule.from_expr(func)
+    typ = relay.transform.InferType()(mod)["main"].body.checked_type
+    np_data = get_random_ndarray((M, K), dtype)
+    np_weight0 = get_random_ndarray((N, K), dtype)
+    np_weight1 = get_random_ndarray((K, M), dtype)
+
+    params = {"weight0": np_weight0, "weight1": np_weight1}
+
+    rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
+    cutlass_rt_mod, dev, num_partition = profile_and_build(
+        mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=False,
+    )
+    cutlass_ansor_rt_mod, dev, num_partition = profile_and_build(
+        mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=True,
+    )
+    x = tvm.nd.array(np_data, device=dev)
+    cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])
+    cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x])
+    ref_out = get_output(rt_mod_ref, ["input"], [x])
+
+    assert num_partition > 0
+    np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol)
+    np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol)
+
+    if run_benchmark:
+        print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600))
+        print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600))
+        print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600))
+
+
+@tvm.testing.requires_cutlass
+def test_dense_transpose_dense():
+    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py
deleted file mode 100644
index 931c9b28b287..000000000000
--- a/tests/python/contrib/test_cutlass_ansor.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import logging
-import tempfile
-import tvm
-from tvm import relay
-import numpy as np
-from tvm.relay import op as _op
-from tvm.relay.op.contrib.cutlass import partition_for_cutlass
-from tvm import auto_scheduler
-from tvm.contrib.cutlass import (
-    has_cutlass,
-    num_cutlass_partitions,
-    finalize_modules,
-)
-import tvm.testing
-
-logging.basicConfig(level=logging.INFO)
-
-
-def get_ref_rt_mod(mod, params, target="cuda"):
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=target, params=params)
-    dev = tvm.device(target, 0)
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    return rt_mod, dev
-
-
-def get_random_ndarray(shape, dtype):
-    if dtype == "int8":
-        return np.random.randint(-128, 128, shape).astype(dtype)
-    elif dtype == "uint8":
-        return np.random.randint(0, 256, shape).astype(dtype)
-    return np.random.uniform(-1, 1, shape).astype(dtype)
-
-
-def get_output(rt_mod, names, inputs):
-    for name, inp in zip(names, inputs):
-        rt_mod.set_input(name, inp)
-    rt_mod.run()
-    return rt_mod.get_output(0).numpy()
-
-
-def get_dense_transpose_dense(M, N, K, dtype="float16"):
-    """
-    dense: [M, K] * [N, K] -> [M, N]
-    transpose: [M, N] -> [N, M]
-    dense: [N, M] * [K, M] -> [N, K]
-
-    input: [M, K]
-    weight0: [N, K]
-    weight1: [K, M]
-    """
-    in_shape = (M, K)
-    w0_shape = (N, K)
-    w1_shape = (K, M)
-
-    input = relay.var("input", shape=in_shape, dtype=dtype)
-    w0 = relay.var("weight0", shape=w0_shape, dtype=dtype)
-    w1 = relay.var("weight1", shape=w1_shape, dtype=dtype)
-
-    out0 = relay.nn.dense(input, w0, out_dtype=dtype)
-    input1 = _op.transpose(out0, axes=(1, 0))
-    out1 = relay.nn.dense(input1, w1, out_dtype=dtype)
-    return out1
-
-
-def build_by_cutlass(
-    mod,
-    params,
-    sm,
-    split_k_slices=[1],
-    tmp_dir="./tmp",
-    use_fast_math=False,
-    use_3xtf32=True,
-):
-    logging.info("before partitioning:\n%s", mod)
-    mod = partition_for_cutlass(mod)
-    logging.info("after partitioning:\n%s", mod)
-
-    num_cutlass_partition = num_cutlass_partitions(mod)
-    host = tvm.target.Target("llvm")
-    cuda = tvm.target.Target("cuda", host=host)
-    cutlass = tvm.target.Target(
-        {
-            "kind": "cutlass",
-            "sm": sm,
-            "use_3xtf32": use_3xtf32,
-            "split_k_slices": split_k_slices,
-            "profile_all_alignments": False,
-            "find_first_valid": True,
-            "use_multiprocessing": True,
-            "use_fast_math": use_fast_math,
-            "tmp_dir": tmp_dir,
-        },
-        host=host,
-    )
-    with tvm.transform.PassContext(opt_level=3):
-        lib = relay.build(mod, target=[cuda, cutlass], params=params)
-    lib = finalize_modules(lib, "compile.so", tmp_dir)
-    dev = tvm.device("cuda", 0)
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    return rt_mod, dev, num_cutlass_partition
-
-
-def build_by_cutlass_ansor(
-    mod,
-    params,
-    sm,
-    split_k_slices=[1],
-    tmp_dir="./tmp",
-    use_fast_math=False,
-    use_3xtf32=True,
-    num_trials=10,
-):
-    logging.info("before partitioning:\n%s", mod)
-    mod = partition_for_cutlass(mod)
-    logging.info("after partitioning:\n%s", mod)
-
-    num_cutlass_partition = num_cutlass_partitions(mod)
-    host = tvm.target.Target("llvm")
-    cuda = tvm.target.Target("cuda", host=host)
-    cutlass = tvm.target.Target(
-        {
-            "kind": "cutlass",
-            "sm": sm,
-            "use_3xtf32": use_3xtf32,
-            "split_k_slices": split_k_slices,
-            "profile_all_alignments": False,
-            "find_first_valid": True,
-            "use_multiprocessing": True,
-            "use_fast_math": use_fast_math,
-            "tmp_dir": tmp_dir,
-        },
-        host=host,
-    )
-
-    # extract tasks
-    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
-        tasks, task_weights = auto_scheduler.extract_tasks(
-            mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]
-        )
-    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-        print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====")
-        print(task.compute_dag)
-
-    # auto-tuning
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-        measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
-        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-        tuner.tune(
-            auto_scheduler.TuningOptions(
-                num_measure_trials=num_trials,
-                runner=measure_ctx.runner,
-                measure_callbacks=[
-                    auto_scheduler.RecordToFile(log_file),
-                ],
-            )
-        )
-
-        with auto_scheduler.ApplyHistoryBest(log_file):
-            with tvm.transform.PassContext(
-                opt_level=3,
-                config={"relay.backend.use_auto_scheduler": True},
-            ):
-                lib = relay.build(
-                    mod,
-                    target=cuda,
-                    target_host=host,
-                    params=params,
-                )
-    lib = finalize_modules(lib, "compile.so", tmp_dir)
-    dev = tvm.device("cuda", 0)
-    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
-    return rt_mod, dev, num_cutlass_partition
-
-
-def verify_dense_transpose_dense(
-    func,
-    M,
-    N,
-    K,
-    ref_target="cuda",
-    sm=80,
-    atol=1e-5,
-    rtol=1e-5,
-    run_benchmark=False,
-    dtype="float16",
-    use_3xtf32=True,
-):
-    assert has_cutlass()
-    if sm < 80 and dtype == "float32":
-        return
-
-    mod = tvm.IRModule.from_expr(func)
-    typ = relay.transform.InferType()(mod)["main"].body.checked_type
-    np_data = get_random_ndarray((M, K), dtype)
-    np_weight0 = get_random_ndarray((N, K), dtype)
-    np_weight1 = get_random_ndarray((K, M), dtype)
-
-    params = {"weight0": np_weight0, "weight1": np_weight1}
-
-    rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
-    cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32)
-    cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(
-        mod, params, sm, use_3xtf32=use_3xtf32
-    )
-    x = tvm.nd.array(np_data, device=dev)
-    cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])
-    cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x])
-    ref_out = get_output(rt_mod_ref, ["input"], [x])
-
-    assert num_partition > 0
-    np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol)
-    np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol)
-
-    if run_benchmark:
-        print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600))
-        print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600))
-        print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600))
-
-
-M = 128
-N = 128
-K = 128
-
-# Use larger M/N/K for significant performance improvement
-# M = 1024
-# N = 1024
-# K = 1024
-
-
-@tvm.testing.requires_cutlass
-def test_dense_transpose_dense():
-    verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()

From 834f7873d02a03c2064cb51c23600432f993c2fa Mon Sep 17 00:00:00 2001
From: hanqingchang <hanqingchang@kuaishou.com>
Date: Wed, 1 Feb 2023 14:42:32 +0800
Subject: [PATCH 6/6] fix lint

---
 tests/python/contrib/test_cutlass.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py
index c52a22202df7..f3d2e98e8937 100644
--- a/tests/python/contrib/test_cutlass.py
+++ b/tests/python/contrib/test_cutlass.py
@@ -312,12 +312,16 @@ def profile_and_build(
     )
 
     if use_ansor:
-        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+        with tvm.transform.PassContext(
+            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
+        ):
             tasks, task_weights = auto_scheduler.extract_tasks(
                 mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]
             )
         for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
-            logging.info(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====")
+            logging.info(
+                f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
+            )
             logging.info(task.compute_dag)
 
         with tempfile.NamedTemporaryFile() as fp:
@@ -325,7 +329,9 @@ def profile_and_build(
 
             # auto-tuning is disabled by default
             if ansor_tuning:
-                measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10)
+                measure_ctx = auto_scheduler.LocalRPCMeasureContext(
+                    repeat=3, min_repeat_ms=200, timeout=10
+                )
                 tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
                 tuner.tune(
                     auto_scheduler.TuningOptions(
@@ -1056,10 +1062,18 @@ def verify_dense_transpose_dense(
 
     rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target)
     cutlass_rt_mod, dev, num_partition = profile_and_build(
-        mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=False,
+        mod,
+        params,
+        sm,
+        use_3xtf32=use_3xtf32,
+        use_ansor=False,
     )
     cutlass_ansor_rt_mod, dev, num_partition = profile_and_build(
-        mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=True,
+        mod,
+        params,
+        sm,
+        use_3xtf32=use_3xtf32,
+        use_ansor=True,
     )
     x = tvm.nd.array(np_data, device=dev)
     cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])