From 560c8da96fa48a8d0b6b573883e885242da96374 Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Thu, 12 Jan 2023 16:54:57 +0800 Subject: [PATCH 1/6] feat: combine cutlass and ansor --- .../tvm/auto_scheduler/relay_integration.py | 4 +- tests/python/contrib/test_cutlass_ansor.py | 259 ++++++++++++++++++ 2 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 tests/python/contrib/test_cutlass_ansor.py diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 52c7f44fcede..30e268473037 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -84,6 +84,7 @@ def extract_tasks( include_simple_tasks=False, dump_workload_to_dag_log=None, opt_level=3, + other_targets=[], ): """Extract tuning tasks from a relay program. @@ -125,12 +126,13 @@ def extract_tasks( old_verbose = dispatch_ctx.verbose dispatch_ctx.verbose = 0 + targets = [target] + other_targets errors = [] with env: # Wrap build call in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool build_thread = threading.Thread( - target=call_all_topi_funcs, args=(mod, params, target, errors, opt_level) + target=call_all_topi_funcs, args=(mod, params, targets, errors, opt_level) ) build_thread.start() build_thread.join() diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py new file mode 100644 index 000000000000..24fd25f350d8 --- /dev/null +++ b/tests/python/contrib/test_cutlass_ansor.py @@ -0,0 +1,259 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import logging +import math +import tvm +from tvm import relay +from tvm.contrib.cudnn import conv_output_shape +import numpy as np +from tvm.runtime.vm import VirtualMachine +from tvm.relay import op as _op +from tvm.relay.op.contrib.cutlass import partition_for_cutlass +from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType +from tvm import auto_scheduler +from tvm.contrib.cutlass import ( + has_cutlass, + num_cutlass_partitions, + finalize_modules, + finalize_modules_vm, +) +import tvm.testing + +logging.basicConfig(level=logging.INFO) + + +def get_ref_rt_mod(mod, params, target="cuda"): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + dev = tvm.device(target, 0) + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + return rt_mod, dev + + +def get_random_ndarray(shape, dtype): + if dtype == "int8": + return np.random.randint(-128, 128, shape).astype(dtype) + elif dtype == "uint8": + return np.random.randint(0, 256, shape).astype(dtype) + return np.random.uniform(-1, 1, shape).astype(dtype) + + +def get_output(rt_mod, names, inputs): + for name, inp in zip(names, inputs): + rt_mod.set_input(name, inp) + rt_mod.run() + return rt_mod.get_output(0).numpy() + + +def get_dense_transpose_dense(M, N, K, dtype="float16"): + """ + dense: [M, K] * [N, K] -> [M, N] + transpose: [M, N] -> [N, M] + dense: [N, M] * [K, M] -> [N, K] + + input: [M, K] + weight0: [N, K] + weight1: [K, M] + """ + in_shape = (M, K) + w0_shape = (N, K) + w1_shape = (K, M) + + input = relay.var("input", shape=in_shape, dtype=dtype) + w0 = relay.var("weight0", shape=w0_shape, dtype=dtype) + w1 = relay.var("weight1", shape=w1_shape, dtype=dtype) + + one = _op.const(1, dtype=dtype) + two = _op.const(2, dtype=dtype) + + out0 = relay.nn.dense(input, w0, out_dtype=dtype) + input1 = _op.transpose(out0, axes=(1, 0)) + out1 = relay.nn.dense(input1, w1, out_dtype=dtype) + return out1 + + +def build_by_cutlass( + mod, + params, + sm, + split_k_slices=[1], + tmp_dir="./tmp", + use_fast_math=False, + use_3xtf32=True, +): + logging.info("before partitioning:\n%s", mod) + mod = partition_for_cutlass(mod) + logging.info("after partitioning:\n%s", mod) + + num_cutlass_partition = num_cutlass_partitions(mod) + host = tvm.target.Target("llvm") + cuda = tvm.target.Target("cuda", host=host) + cutlass = tvm.target.Target( + { + "kind": "cutlass", + "sm": sm, + "use_3xtf32": use_3xtf32, + "split_k_slices": split_k_slices, + "profile_all_alignments": False, + "find_first_valid": True, + "use_multiprocessing": True, + "use_fast_math": use_fast_math, + "tmp_dir": tmp_dir, + }, + host=host, + ) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=[cuda, cutlass], params=params) + lib = finalize_modules(lib, "compile.so", tmp_dir) + dev = tvm.device("cuda", 0) + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + return rt_mod, dev, num_cutlass_partition + + +def build_by_cutlass_ansor( + mod, + params, + sm, + split_k_slices=[1], + tmp_dir="./tmp", + use_fast_math=False, + use_3xtf32=True, + num_trials=10, +): + logging.info("before partitioning:\n%s", mod) + mod = partition_for_cutlass(mod) + logging.info("after partitioning:\n%s", mod) + + num_cutlass_partition = num_cutlass_partitions(mod) + host = tvm.target.Target("llvm") + cuda = tvm.target.Target("cuda", host=host) + cutlass = tvm.target.Target( + { + "kind": "cutlass", + "sm": sm, + "use_3xtf32": use_3xtf32, + "split_k_slices": split_k_slices, + "profile_all_alignments": False, + "find_first_valid": True, + "use_multiprocessing": True, + "use_fast_math": use_fast_math, + "tmp_dir": tmp_dir, + }, + host=host, + ) + + # extract tasks + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + tasks, task_weights = auto_scheduler.extract_tasks( + mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") + print(task.compute_dag) + + # auto-tuning + log_file = "cutlass_ansor.log" + measure_ctx = auto_scheduler.LocalRPCMeasureContext( + repeat=3, min_repeat_ms=200, timeout=10 + ) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=num_trials, + runner=measure_ctx.runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ) + ) + + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay.build( + mod, + target=cuda, + target_host=host, + params=params, + ) + lib = finalize_modules(lib, "compile.so", tmp_dir) + dev = tvm.device("cuda", 0) + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + return rt_mod, dev, num_cutlass_partition + + +def verify_dense_transpose_dense( + func, + M, + N, + K, + ref_target="cuda", + sm=80, + atol=1e-5, + rtol=1e-5, + run_benchmark=False, + dtype="float16", + use_3xtf32=True, +): + assert has_cutlass() + if sm < 80 and dtype == "float32": + return + + mod = tvm.IRModule.from_expr(func) + typ = relay.transform.InferType()(mod)["main"].body.checked_type + np_data = get_random_ndarray((M, K), dtype) + np_weight0 = get_random_ndarray((N, K), dtype) + np_weight1 = get_random_ndarray((K, M), dtype) + + params = {"weight0": np_weight0, "weight1": np_weight1} + + rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) + cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32) + cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(mod, params, sm, use_3xtf32=use_3xtf32) + x = tvm.nd.array(np_data, device=dev) + cutlass_out = get_output(cutlass_rt_mod, ["input"], [x]) + cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x]) + ref_out = get_output(rt_mod_ref, ["input"], [x]) + + assert num_partition > 0 + np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol) + np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol) + + if run_benchmark: + print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600)) + print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600)) + print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600)) + + +M = 128 +N = 128 +K = 128 + +# Use larger M/N/K for significant performance improvement +# M = 1024 +# N = 1024 +# K = 1024 + + +@tvm.testing.requires_cutlass +def test_dense_transpose_dense(): + verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K, sm=75, run_benchmark=True) + + +if __name__ == "__main__": + tvm.testing.main() From d43f1796ff3b095927f0bbfbcd3d53b4a145edd3 Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Tue, 31 Jan 2023 11:26:34 +0800 Subject: [PATCH 2/6] use sm80 and disable run_benchmark --- tests/python/contrib/test_cutlass_ansor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py index 24fd25f350d8..92bbd64c99c4 100644 --- a/tests/python/contrib/test_cutlass_ansor.py +++ b/tests/python/contrib/test_cutlass_ansor.py @@ -252,7 +252,7 @@ def verify_dense_transpose_dense( @tvm.testing.requires_cutlass def test_dense_transpose_dense(): - verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K, sm=75, run_benchmark=True) + verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K) if __name__ == "__main__": From b86e9d9b9e8157ba7d74ec61b6ed10deb61885ca Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Tue, 31 Jan 2023 11:55:12 +0800 Subject: [PATCH 3/6] fix lint error --- tests/python/contrib/test_cutlass_ansor.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py index 92bbd64c99c4..ca14b0a32101 100644 --- a/tests/python/contrib/test_cutlass_ansor.py +++ b/tests/python/contrib/test_cutlass_ansor.py @@ -77,9 +77,6 @@ def get_dense_transpose_dense(M, N, K, dtype="float16"): w0 = relay.var("weight0", shape=w0_shape, dtype=dtype) w1 = relay.var("weight1", shape=w1_shape, dtype=dtype) - one = _op.const(1, dtype=dtype) - two = _op.const(2, dtype=dtype) - out0 = relay.nn.dense(input, w0, out_dtype=dtype) input1 = _op.transpose(out0, axes=(1, 0)) out1 = relay.nn.dense(input1, w1, out_dtype=dtype) @@ -159,16 +156,15 @@ def build_by_cutlass_ansor( # extract tasks with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): tasks, task_weights = auto_scheduler.extract_tasks( - mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass]) + mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass] + ) for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") print(task.compute_dag) # auto-tuning log_file = "cutlass_ansor.log" - measure_ctx = auto_scheduler.LocalRPCMeasureContext( - repeat=3, min_repeat_ms=200, timeout=10 - ) + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune( auto_scheduler.TuningOptions( @@ -224,7 +220,9 @@ def verify_dense_transpose_dense( rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32) - cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor(mod, params, sm, use_3xtf32=use_3xtf32) + cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor( + mod, params, sm, use_3xtf32=use_3xtf32 + ) x = tvm.nd.array(np_data, device=dev) cutlass_out = get_output(cutlass_rt_mod, ["input"], [x]) cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x]) @@ -256,4 +254,4 @@ def test_dense_transpose_dense(): if __name__ == "__main__": - tvm.testing.main() + tvm.testing.main() From 639ce62d20fe9eaac6a80f0ca057ad1f3bf44e1e Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Tue, 31 Jan 2023 12:59:22 +0800 Subject: [PATCH 4/6] use tempfile; fix dangerous default value --- .../tvm/auto_scheduler/relay_integration.py | 8 ++- tests/python/contrib/test_cutlass_ansor.py | 51 +++++++++---------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 30e268473037..973cbf19bece 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -84,7 +84,7 @@ def extract_tasks( include_simple_tasks=False, dump_workload_to_dag_log=None, opt_level=3, - other_targets=[], + other_targets=None, ): """Extract tuning tasks from a relay program. @@ -106,6 +106,8 @@ def extract_tasks( A file to dump an association between the workload keys and the actual DAG opt_level : Optional[int] The optimization level of the task extractions. + other_targets: Optional[List[tvm.target.Target]] + Other targets for call_all_topi_funcs, e.g., cutlass target. Returns ------- @@ -126,7 +128,9 @@ def extract_tasks( old_verbose = dispatch_ctx.verbose dispatch_ctx.verbose = 0 - targets = [target] + other_targets + targets = [target] + if other_targets is not None: + targets += other_targets errors = [] with env: # Wrap build call in a new thread to avoid the conflict diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py index ca14b0a32101..931c9b28b287 100644 --- a/tests/python/contrib/test_cutlass_ansor.py +++ b/tests/python/contrib/test_cutlass_ansor.py @@ -15,21 +15,17 @@ # specific language governing permissions and limitations # under the License. import logging -import math +import tempfile import tvm from tvm import relay -from tvm.contrib.cudnn import conv_output_shape import numpy as np -from tvm.runtime.vm import VirtualMachine from tvm.relay import op as _op from tvm.relay.op.contrib.cutlass import partition_for_cutlass -from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType from tvm import auto_scheduler from tvm.contrib.cutlass import ( has_cutlass, num_cutlass_partitions, finalize_modules, - finalize_modules_vm, ) import tvm.testing @@ -163,30 +159,31 @@ def build_by_cutlass_ansor( print(task.compute_dag) # auto-tuning - log_file = "cutlass_ansor.log" - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=num_trials, - runner=measure_ctx.runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=num_trials, + runner=measure_ctx.runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ) ) - ) - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=cuda, - target_host=host, - params=params, - ) + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay.build( + mod, + target=cuda, + target_host=host, + params=params, + ) lib = finalize_modules(lib, "compile.so", tmp_dir) dev = tvm.device("cuda", 0) rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) From 1096f9417dac25a8482e320a0dedb741682f279f Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Wed, 1 Feb 2023 14:22:42 +0800 Subject: [PATCH 5/6] merge cutlass_ansor test into test_cutlass.py --- tests/python/contrib/test_cutlass.py | 126 +++++++++- tests/python/contrib/test_cutlass_ansor.py | 254 --------------------- 2 files changed, 124 insertions(+), 256 deletions(-) delete mode 100644 tests/python/contrib/test_cutlass_ansor.py diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py index 753ee178f9d3..c52a22202df7 100644 --- a/tests/python/contrib/test_cutlass.py +++ b/tests/python/contrib/test_cutlass.py @@ -15,13 +15,16 @@ # specific language governing permissions and limitations # under the License. import logging +import tempfile import math import tvm from tvm import relay from tvm.contrib.cudnn import conv_output_shape import numpy as np +from tvm.relay import op as _op from tvm.runtime.vm import VirtualMachine from tvm.relay.op.contrib.cutlass import partition_for_cutlass +from tvm import auto_scheduler from tvm.relay.transform import FirstOrderGradient, ToMixedPrecision, InferType from tvm.contrib.cutlass import ( has_cutlass, @@ -235,6 +238,32 @@ def get_conv2d_backward_weight( ) +def get_dense_transpose_dense(M, N, K, dtype="float16"): + """ + output = nn.dense(_op.transpose(nn.dense(input, weight0), axes=(1, 0)), weight1) + + dense0: [M, K] * [N, K] -> [M, N] + transpose: [M, N] -> [N, M] + dense1: [N, M] * [K, M] -> [N, K] + + input: [M, K] + weight0: [N, K] + weight1: [K, M] + """ + input_shape = (M, K) + weight0_shape = (N, K) + weight1_shape = (K, M) + + input = relay.var("input", shape=input_shape, dtype=dtype) + weight0 = relay.var("weight0", shape=weight0_shape, dtype=dtype) + weight1 = relay.var("weight1", shape=weight1_shape, dtype=dtype) + + output0 = relay.nn.dense(input, weight0, out_dtype=dtype) + input1 = _op.transpose(output0, axes=(1, 0)) + output = relay.nn.dense(input1, weight1, out_dtype=dtype) + return output + + def convert_conv2d_layout(mod, desired_layouts): with tvm.transform.PassContext(opt_level=3): seq = tvm.transform.Sequential([relay.transform.ConvertLayout(desired_layouts)]) @@ -257,6 +286,8 @@ def profile_and_build( tmp_dir="./tmp", use_fast_math=False, use_3xtf32=True, + use_ansor=False, + ansor_tuning=False, ): logging.info("before partitioning:\n%s", mod) mod = partition_for_cutlass(mod) @@ -279,8 +310,47 @@ def profile_and_build( }, host=host, ) - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=[cuda, cutlass], params=params) + + if use_ansor: + with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + tasks, task_weights = auto_scheduler.extract_tasks( + mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass] + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + logging.info(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") + logging.info(task.compute_dag) + + with tempfile.NamedTemporaryFile() as fp: + log_file = fp.name + + # auto-tuning is disabled by default + if ansor_tuning: + measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=100, + runner=measure_ctx.runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ) + ) + + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay.build( + mod, + target=cuda, + target_host=host, + params=params, + ) + else: + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=[cuda, cutlass], params=params) lib = finalize_modules(lib, "compile.so", tmp_dir) dev = tvm.device("cuda", 0) rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) @@ -959,5 +1029,57 @@ def test_conv2d_bwd(): ) +def verify_dense_transpose_dense( + func, + M, + N, + K, + ref_target="cuda", + sm=80, + atol=1e-5, + rtol=1e-5, + run_benchmark=False, + dtype="float16", + use_3xtf32=True, +): + assert has_cutlass() + if sm < 80 and dtype == "float32": + return + + mod = tvm.IRModule.from_expr(func) + typ = relay.transform.InferType()(mod)["main"].body.checked_type + np_data = get_random_ndarray((M, K), dtype) + np_weight0 = get_random_ndarray((N, K), dtype) + np_weight1 = get_random_ndarray((K, M), dtype) + + params = {"weight0": np_weight0, "weight1": np_weight1} + + rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) + cutlass_rt_mod, dev, num_partition = profile_and_build( + mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=False, + ) + cutlass_ansor_rt_mod, dev, num_partition = profile_and_build( + mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=True, + ) + x = tvm.nd.array(np_data, device=dev) + cutlass_out = get_output(cutlass_rt_mod, ["input"], [x]) + cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x]) + ref_out = get_output(rt_mod_ref, ["input"], [x]) + + assert num_partition > 0 + np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol) + np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol) + + if run_benchmark: + print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600)) + print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600)) + print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600)) + + +@tvm.testing.requires_cutlass +def test_dense_transpose_dense(): + verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/contrib/test_cutlass_ansor.py b/tests/python/contrib/test_cutlass_ansor.py deleted file mode 100644 index 931c9b28b287..000000000000 --- a/tests/python/contrib/test_cutlass_ansor.py +++ /dev/null @@ -1,254 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import logging -import tempfile -import tvm -from tvm import relay -import numpy as np -from tvm.relay import op as _op -from tvm.relay.op.contrib.cutlass import partition_for_cutlass -from tvm import auto_scheduler -from tvm.contrib.cutlass import ( - has_cutlass, - num_cutlass_partitions, - finalize_modules, -) -import tvm.testing - -logging.basicConfig(level=logging.INFO) - - -def get_ref_rt_mod(mod, params, target="cuda"): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - dev = tvm.device(target, 0) - rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - return rt_mod, dev - - -def get_random_ndarray(shape, dtype): - if dtype == "int8": - return np.random.randint(-128, 128, shape).astype(dtype) - elif dtype == "uint8": - return np.random.randint(0, 256, shape).astype(dtype) - return np.random.uniform(-1, 1, shape).astype(dtype) - - -def get_output(rt_mod, names, inputs): - for name, inp in zip(names, inputs): - rt_mod.set_input(name, inp) - rt_mod.run() - return rt_mod.get_output(0).numpy() - - -def get_dense_transpose_dense(M, N, K, dtype="float16"): - """ - dense: [M, K] * [N, K] -> [M, N] - transpose: [M, N] -> [N, M] - dense: [N, M] * [K, M] -> [N, K] - - input: [M, K] - weight0: [N, K] - weight1: [K, M] - """ - in_shape = (M, K) - w0_shape = (N, K) - w1_shape = (K, M) - - input = relay.var("input", shape=in_shape, dtype=dtype) - w0 = relay.var("weight0", shape=w0_shape, dtype=dtype) - w1 = relay.var("weight1", shape=w1_shape, dtype=dtype) - - out0 = relay.nn.dense(input, w0, out_dtype=dtype) - input1 = _op.transpose(out0, axes=(1, 0)) - out1 = relay.nn.dense(input1, w1, out_dtype=dtype) - return out1 - - -def build_by_cutlass( - mod, - params, - sm, - split_k_slices=[1], - tmp_dir="./tmp", - use_fast_math=False, - use_3xtf32=True, -): - logging.info("before partitioning:\n%s", mod) - mod = partition_for_cutlass(mod) - logging.info("after partitioning:\n%s", mod) - - num_cutlass_partition = num_cutlass_partitions(mod) - host = tvm.target.Target("llvm") - cuda = tvm.target.Target("cuda", host=host) - cutlass = tvm.target.Target( - { - "kind": "cutlass", - "sm": sm, - "use_3xtf32": use_3xtf32, - "split_k_slices": split_k_slices, - "profile_all_alignments": False, - "find_first_valid": True, - "use_multiprocessing": True, - "use_fast_math": use_fast_math, - "tmp_dir": tmp_dir, - }, - host=host, - ) - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=[cuda, cutlass], params=params) - lib = finalize_modules(lib, "compile.so", tmp_dir) - dev = tvm.device("cuda", 0) - rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - return rt_mod, dev, num_cutlass_partition - - -def build_by_cutlass_ansor( - mod, - params, - sm, - split_k_slices=[1], - tmp_dir="./tmp", - use_fast_math=False, - use_3xtf32=True, - num_trials=10, -): - logging.info("before partitioning:\n%s", mod) - mod = partition_for_cutlass(mod) - logging.info("after partitioning:\n%s", mod) - - num_cutlass_partition = num_cutlass_partitions(mod) - host = tvm.target.Target("llvm") - cuda = tvm.target.Target("cuda", host=host) - cutlass = tvm.target.Target( - { - "kind": "cutlass", - "sm": sm, - "use_3xtf32": use_3xtf32, - "split_k_slices": split_k_slices, - "profile_all_alignments": False, - "find_first_valid": True, - "use_multiprocessing": True, - "use_fast_math": use_fast_math, - "tmp_dir": tmp_dir, - }, - host=host, - ) - - # extract tasks - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - tasks, task_weights = auto_scheduler.extract_tasks( - mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass] - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") - print(task.compute_dag) - - # auto-tuning - with tempfile.NamedTemporaryFile() as fp: - log_file = fp.name - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=num_trials, - runner=measure_ctx.runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], - ) - ) - - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=cuda, - target_host=host, - params=params, - ) - lib = finalize_modules(lib, "compile.so", tmp_dir) - dev = tvm.device("cuda", 0) - rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - return rt_mod, dev, num_cutlass_partition - - -def verify_dense_transpose_dense( - func, - M, - N, - K, - ref_target="cuda", - sm=80, - atol=1e-5, - rtol=1e-5, - run_benchmark=False, - dtype="float16", - use_3xtf32=True, -): - assert has_cutlass() - if sm < 80 and dtype == "float32": - return - - mod = tvm.IRModule.from_expr(func) - typ = relay.transform.InferType()(mod)["main"].body.checked_type - np_data = get_random_ndarray((M, K), dtype) - np_weight0 = get_random_ndarray((N, K), dtype) - np_weight1 = get_random_ndarray((K, M), dtype) - - params = {"weight0": np_weight0, "weight1": np_weight1} - - rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) - cutlass_rt_mod, dev, num_partition = build_by_cutlass(mod, params, sm, use_3xtf32=use_3xtf32) - cutlass_ansor_rt_mod, dev, num_partition = build_by_cutlass_ansor( - mod, params, sm, use_3xtf32=use_3xtf32 - ) - x = tvm.nd.array(np_data, device=dev) - cutlass_out = get_output(cutlass_rt_mod, ["input"], [x]) - cutlass_ansor_out = get_output(cutlass_ansor_rt_mod, ["input"], [x]) - ref_out = get_output(rt_mod_ref, ["input"], [x]) - - assert num_partition > 0 - np.testing.assert_allclose(cutlass_out, ref_out, atol=atol, rtol=rtol) - np.testing.assert_allclose(cutlass_ansor_out, ref_out, atol=atol, rtol=rtol) - - if run_benchmark: - print("CUTLASS:", cutlass_rt_mod.benchmark(dev, number=1, repeat=600)) - print("CUTLASS with Ansor:", cutlass_ansor_rt_mod.benchmark(dev, number=1, repeat=600)) - print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600)) - - -M = 128 -N = 128 -K = 128 - -# Use larger M/N/K for significant performance improvement -# M = 1024 -# N = 1024 -# K = 1024 - - -@tvm.testing.requires_cutlass -def test_dense_transpose_dense(): - verify_dense_transpose_dense(get_dense_transpose_dense(M, N, K), M, N, K) - - -if __name__ == "__main__": - tvm.testing.main() From 834f7873d02a03c2064cb51c23600432f993c2fa Mon Sep 17 00:00:00 2001 From: hanqingchang Date: Wed, 1 Feb 2023 14:42:32 +0800 Subject: [PATCH 6/6] fix lint --- tests/python/contrib/test_cutlass.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py index c52a22202df7..f3d2e98e8937 100644 --- a/tests/python/contrib/test_cutlass.py +++ b/tests/python/contrib/test_cutlass.py @@ -312,12 +312,16 @@ def profile_and_build( ) if use_ansor: - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): tasks, task_weights = auto_scheduler.extract_tasks( mod, params, cuda, include_simple_tasks=True, opt_level=3, other_targets=[cutlass] ) for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - logging.info(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") + logging.info( + f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====" + ) logging.info(task.compute_dag) with tempfile.NamedTemporaryFile() as fp: @@ -325,7 +329,9 @@ def profile_and_build( # auto-tuning is disabled by default if ansor_tuning: - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=3, min_repeat_ms=200, timeout=10) + measure_ctx = auto_scheduler.LocalRPCMeasureContext( + repeat=3, min_repeat_ms=200, timeout=10 + ) tuner = auto_scheduler.TaskScheduler(tasks, task_weights) tuner.tune( auto_scheduler.TuningOptions( @@ -1056,10 +1062,18 @@ def verify_dense_transpose_dense( rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) cutlass_rt_mod, dev, num_partition = profile_and_build( - mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=False, + mod, + params, + sm, + use_3xtf32=use_3xtf32, + use_ansor=False, ) cutlass_ansor_rt_mod, dev, num_partition = profile_and_build( - mod, params, sm, use_3xtf32=use_3xtf32, use_ansor=True, + mod, + params, + sm, + use_3xtf32=use_3xtf32, + use_ansor=True, ) x = tvm.nd.array(np_data, device=dev) cutlass_out = get_output(cutlass_rt_mod, ["input"], [x])