diff --git a/python/tvm/meta_schedule/testing/torchbench/__init__.py b/python/tvm/meta_schedule/testing/torchbench/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/torchbench/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/tvm/meta_schedule/testing/torchbench/run.py b/python/tvm/meta_schedule/testing/torchbench/run.py
new file mode 100644
index 000000000000..f6984d1c9d10
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/torchbench/run.py
@@ -0,0 +1,609 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This script benchmarks TVM performance on models from TorchBench.
+It uses TorchDynamo as the frontend to ingest models into TVM, and it also
+leverages the benchmark utilities from TorchDynamo.
+
+TorchDynamo (https://github.com/pytorch/torchdynamo) and TorchBench
+(https://github.com/pytorch/benchmark) need to be in the parent directory of TVM.
+We need local clones of these repos because TorchBench and the benchmark runner
+in TorchDynamo aren't designed to be used as Python packages.
+
+To set up the environment, run the following commands in the parent directory of TVM,
+using the appropriate Python environment:
+```bash
+# torchdynamo requires nightly pytorch. If it fails to find the specified version, try
+# installing the latest nightly pytorch.
+pip3 install --pre \
+    --extra-index-url https://download.pytorch.org/whl/nightly/cu116 \
+    torch==1.13.0.dev20220926 \
+    torchvision==0.14.0.dev20220926 \
+    torchtext==0.14.0.dev20220926
+
+git clone https://github.com/pytorch/torchdynamo
+pushd torchdynamo
+git checkout c537639f9712621dc04ca09908796dbbe86c354b
+pip install -e .
+popd
+
+sudo apt install git-lfs  # git lfs is used for TorchBench
+git clone https://github.com/pytorch/benchmark
+pushd benchmark
+python install.py --continue_on_fail  # fambench_xlmr might fail to install
+popd
+```
+
+To run a benchmark, the script can first be run under 'tune' mode:
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode tune \
+    --model resnet50 \
+    --target "nvidia/geforce-rtx-3070" \
+    --work-dir ../workdir \
+    --num-trials 20000 \
+    --rpc-host \
+    --rpc-port \
+    --rpc-key \
+```
+
+All available target tags (like nvidia/geforce-rtx-3070) can be found at
+https://github.com/apache/tvm/blob/main/src/target/tag.cc
+
+Then the script can be run under 'eval' mode to actually benchmark the performance,
+using the tuning database under the work directory. This can be executed on a different
+machine than the one that ran the tuning (the database JSON files need to be inside
+the work directory).
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode eval \
+    --model resnet50 \
+    --target "nvidia/geforce-rtx-3070" \
+    --work-dir ../workdir \
+    --num-trials 0
+```
+
+Alternatively, both tuning and evaluation can be done in a single run on the same machine:
+```bash
+python python/tvm/meta_schedule/testing/torchbench/run.py \
+    --mode all \
+    --model resnet50 \
+    --target "llvm -num-cores 6" \
+    --work-dir ../workdir \
+    --num-trials 0
+```
+"""
+
+# pylint: disable=logging-format-interpolation
+
+import argparse
+import functools
+import logging
+import warnings
+from enum import Enum
+from typing import Callable, List, Tuple
+
+import numpy as np  # type: ignore
+import torch  # type: ignore
+from scipy.stats import ttest_ind  # type: ignore
+
+import tvm
+import tvm.relay
+from tvm import meta_schedule as ms
+from tvm.contrib.graph_executor import GraphModule
+from tvm.meta_schedule.testing.torchbench.utils import (
+    load_torchdynamo_benchmark_runner,
+    same,
+    timed,
+)
+from tvm.runtime.vm import VirtualMachine
+from tvm.support import describe
+
+# Needs to be imported after .utils is executed
+import torchdynamo  # type: ignore # isort: skip, pylint: disable=wrong-import-order
+
+
+class RunMode(Enum):
+    """
+    The running mode of this script. Available values are:
+    - tune: Only tune the model and create the tuning database.
+    - eval: Only benchmark the model, using a pre-existing tuning database.
+    - all: Run both tuning and benchmarking.
+    """
+
+    ALL = "all"
+    TUNE = "tune"
+    EVAL = "eval"
+
+    @property
+    def should_tune(self):
+        """
+        Returns whether it should tune the model.
+        """
+        return self != RunMode.EVAL
+
+    @property
+    def should_eval(self):
+        """
+        Returns whether it should actually benchmark the model.
+        """
+        return self != RunMode.TUNE
+
+
+class ResultComparisonMetric(Enum):
+    """
+    This controls how the result is compared with the expected value during
+    the accuracy check.
+    - cosine: Use the cosine similarity. It should be greater than 0.99.
+    - allclose-1e-4: Use the max element-wise absolute difference. It should be less than 1e-4.
+    """
+
+    COSINE = "cosine"
+    ALLCLOSE = "allclose-1e-4"
+
+
+def parse_args():
+    """
+    Parse arguments
+    """
+    args = argparse.ArgumentParser()
+
+    args.add_argument(
+        "--mode",
+        type=RunMode,
+        default=RunMode.ALL,
+        help=RunMode.__doc__,
+    )
+    args.add_argument(
+        "--batch-size",
+        type=int,
+        default=None,
+        help="The batch size of model input. Use TorchBench's default value if not specified.",
+    )
+    args.add_argument(
+        "--result-metric",
+        type=ResultComparisonMetric,
+        default=ResultComparisonMetric.ALLCLOSE,
+        help=ResultComparisonMetric.__doc__,
+    )
+    args.add_argument(
+        "--benchmark-repeat",
+        type=int,
+        default=10,
+        help="The number of times to repeat the benchmark measurement.",
+    )
+    args.add_argument(
+        "--benchmark-warmup-rounds",
+        type=int,
+        default=5,
+        help="The number of rounds to warm up before starting to measure the performance.",
+    )
+
+    # Model selection
+    args.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="""
+        The name of the model to run. It should be a directory name under
+        https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models.
+        """,
+    )
+
+    # Tuning-related config
+    args.add_argument(
+        "--target",
+        type=tvm.target.Target,
+        required=True,
+        help="The target to tune and run benchmark for.",
+    )
+    args.add_argument(
+        "--work-dir",
+        type=str,
+        required=True,
+        help="""
+        The working directory to save intermediate results and store databases for compilation.
+        """,
+    )
+    args.add_argument(
+        "--cache-dir",
+        type=str,
+        default=None,
+        help="""
+        The directory to cache the generated network.
+        If not specified, the cache will be disabled.
+        """,
+    )
+    args.add_argument(
+        "--num-trials",
+        type=int,
+        required=True,
+        help="The max number of trials to run MetaSchedule.",
+    )
+    args.add_argument(
+        "--max-trials-per-task",
+        type=int,
+        default=None,
+        help="""
+        The max number of trials to run per task extracted in MetaSchedule.
+        By default it's the same as --num-trials.
+        """,
+    )
+    args.add_argument(
+        "--backend",
+        type=str,
+        choices=["graph", "vm"],
+        default="graph",
+        help="The backend to use for relay compilation (graph / vm).",
+    )
+    # TODO(@yelite): Add a layout arg to transform the network after
+    # ingesting into Relay and before feeding into MetaSchedule.
+
+    # Evaluator-related config
+    args.add_argument(
+        "--number",
+        type=int,
+        default=3,
+        help="The number of times to run the model for taking average in a single measurement.",
+    )
+    args.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+        help="The number of times to repeat the measurement.",
+    )
+    args.add_argument(
+        "--min-repeat-ms",
+        type=int,
+        default=100,
+        help="""
+        Minimum repeat time in ms. The number of runs will be increased if the actual
+        repeat time is lower than this.
+        """,
+    )
+    args.add_argument(
+        "--adaptive-training",
+        action="store_true",
+        help="Whether to use adaptive training for the cost model.",
+    )
+    args.add_argument(
+        "--cpu-flush",
+        action="store_true",
+        help="Whether to perform CPU cache flush.",
+    )
+
+    # RPC-related args
+    args.add_argument(
+        "--rpc-host",
+        type=str,
+        help="Host of the RPC Tracker for tuning. Use LocalRunner if not provided.",
+    )
+    args.add_argument(
+        "--rpc-port",
+        type=int,
+        help="Port of the RPC Tracker for tuning",
+    )
+    args.add_argument(
+        "--rpc-key",
+        type=str,
+        help="Key of the RPC Tracker for tuning",
+    )
+
+    parsed = args.parse_args()
+    return parsed
+
+
+logging.basicConfig(
+    format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+)
+logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG)
+ARGS = parse_args()
+IS_CUDA = ARGS.target.kind.name == "cuda"
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+logger.setLevel(logging.INFO)
+
+
+runner = load_torchdynamo_benchmark_runner(  # pylint: disable=invalid-name
+    IS_CUDA, cosine_similarity=ARGS.result_metric == ResultComparisonMetric.COSINE
+)
+
+
+def get_metaschedule_runner() -> ms.runner.PyRunner:
+    """
+    Get the Runner for MetaSchedule.
+
+    It returns RPCRunner if --rpc-host is given, otherwise it returns LocalRunner.
+    """
+    if ARGS.rpc_host is not None:
+        assert ARGS.rpc_port is not None, "Missing rpc_port"
+        assert ARGS.rpc_key is not None, "Missing rpc_key"
+        return ms.runner.RPCRunner(
+            rpc_config=ms.runner.RPCConfig(
+                tracker_host=ARGS.rpc_host,
+                tracker_port=ARGS.rpc_port,
+                tracker_key=ARGS.rpc_key,
+                session_timeout_sec=600,
+            ),
+            evaluator_config=ms.runner.EvaluatorConfig(
+                number=ARGS.number,
+                repeat=ARGS.repeat,
+                min_repeat_ms=ARGS.min_repeat_ms,
+                enable_cpu_cache_flush=ARGS.cpu_flush,
+            ),
+            alloc_repeat=1,
+        )
+    else:
+        warnings.warn("Falling back to MetaSchedule LocalRunner because --rpc-host isn't provided.")
+        return ms.runner.LocalRunner()
+
+
+def get_tune_config() -> ms.TuneConfig:
+    """
+    Get the TuneConfig.
+    """
+    if ARGS.mode.should_tune:
+        max_trials_per_task = ARGS.max_trials_per_task
+        max_trials_global = ARGS.num_trials
+    else:
+        max_trials_per_task = 0
+        max_trials_global = 0
+
+    if max_trials_per_task is None:
+        max_trials_per_task = max_trials_global
+
+    return ms.TuneConfig(
+        strategy="evolutionary",
+        num_trials_per_iter=64,
+        max_trials_per_task=max_trials_per_task,
+        max_trials_global=max_trials_global,
+        adaptive_training=ARGS.adaptive_training,
+    )
+
+
+def get_graph_executor_forward(mod: GraphModule, device: tvm.runtime.Device) -> Callable:
+    """
+    Get the forward function for graph executor, in order to integrate with TorchDynamo.
+    """
+
+    def forward(*args):
+        if IS_CUDA:
+            torch.cuda.synchronize()
+        args = tuple(arg.contiguous() for arg in args)
+        for idx, arg in enumerate(args, 0):
+            mod.set_input(
+                f"inp_{idx}",
+                tvm.nd.from_dlpack(arg),
+            )
+        mod.run()
+        device.sync()
+        result = [torch.from_dlpack(mod.get_output(i)) for i in range(mod.get_num_outputs())]
+        return result
+
+    return forward
+
+
+def get_vm_forward(virtual_machine: VirtualMachine, device: tvm.runtime.Device) -> Callable:
+    """
+    Get the forward function for VM, in order to integrate with TorchDynamo.
+    """
+
+    def forward(*args):
+        if IS_CUDA:
+            torch.cuda.synchronize()
+        args = tuple(tvm.nd.from_dlpack(arg.contiguous()) for arg in args)
+        result = virtual_machine.invoke("main", *args)
+        device.sync()
+
+        if isinstance(result, tvm.nd.NDArray):
+            result = [result]
+        return [torch.from_dlpack(m) for m in result]
+
+    return forward
+
+
+def create_tvm_task_collection_backend(tasks: List[ms.ExtractedTask]) -> Callable:
+    """
+    This torchdynamo backend only collects the tasks extracted by MetaSchedule.
+    It doesn't tune the model.
+    """
+
+    def backend(graph_module, example_inputs):
+        jit_mod = torch.jit.trace(graph_module, example_inputs)
+        shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+        ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
+
+        extracted_tasks = ms.extract_task_from_relay(ir_mod, ARGS.target, params)
+        logger.info("Extracted %d tasks", len(extracted_tasks))
+        tasks.extend(extracted_tasks)
+
+        return graph_module.forward
+
+    return backend
+
+
+def create_tvm_compilation_backend(database: ms.database.Database) -> Callable:
+    """
+    This torchdynamo backend compiles the model using the history-best records from the
+    MetaSchedule database.
+    """
+
+    def backend(graph_module, example_inputs):
+        # pylint: disable=import-outside-toplevel
+        from tvm.ir.transform import PassContext
+
+        # pylint: enable=import-outside-toplevel
+
+        jit_mod = torch.jit.trace(graph_module, example_inputs)
+        shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+        ir_mod, params = tvm.relay.frontend.from_pytorch(jit_mod, shape_list)
+
+        relay_build = {"graph": tvm.relay.build, "vm": tvm.relay.vm.compile}[ARGS.backend]
+        with ARGS.target, ms.utils.autotvm_silencer(), database:
+            with PassContext(
+                opt_level=3,
+                config={
+                    "relay.backend.use_meta_schedule": True,
+                    "relay.backend.use_meta_schedule_dispatch": not IS_CUDA,
+                    "relay.backend.tir_converter": "default",
+                },
+            ):
+                lib = relay_build(ir_mod, target=ARGS.target, params=params)
+
+        device = tvm.cuda(0) if IS_CUDA else tvm.cpu(0)
+
+        if ARGS.backend == "graph":
+            mod = GraphModule(lib["default"](device))
+            return get_graph_executor_forward(mod, device)
+        elif ARGS.backend == "vm":
+            vm = VirtualMachine(lib, device)  # pylint: disable=invalid-name
+            return get_vm_forward(vm, device)
+        else:
+            raise RuntimeError(f"Unknown backend {ARGS.backend}")
+
+    return backend
+
+
+def format_time(seconds: float) -> str:
+    """
+    Format elapsed time based on its value.
+    """
+    if seconds > 1:
+        return f"{seconds:.3g}s"
+    else:
+        return f"{seconds * 1000:.3g}ms"
+
+
+def is_output_correct(output: torch.Tensor, expected: torch.Tensor) -> bool:
+    """
+    Check whether the output is correct.
+    """
+    comparison_metric = ARGS.result_metric
+    if comparison_metric == ResultComparisonMetric.COSINE:
+        return same(expected, output, cosine_similarity=True)
+    elif comparison_metric == ResultComparisonMetric.ALLCLOSE:
+        return same(expected, output, tol=1e-4)
+    else:
+        raise RuntimeError(f"Unknown comparison metric {comparison_metric}")
+
+
+def performance_experiment(
+    model_iter_fn: Callable, model: torch.nn.Module, example_inputs: Tuple[torch.Tensor]
+) -> str:
+    """
+    Performs the actual benchmarking.
+    Simplified from https://github.com/pytorch/torchdynamo/blob/c537639f9712621dc04ca09908796dbbe86c354b/benchmarks/common.py#L494 pylint: disable=line-too-long
+    """
+    timings = np.zeros((ARGS.benchmark_repeat, 2), np.float64)
+
+    is_correct = True
+
+    frozen_model_iter_fn = torchdynamo.run(model_iter_fn)
+
+    for _ in range(ARGS.benchmark_warmup_rounds):
+        frozen_model_iter_fn(model, example_inputs)
+        model_iter_fn(model, example_inputs)
+
+    for rep in range(ARGS.benchmark_repeat):
+        # interleave the runs to handle frequency scaling and load changes
+        timings[rep, 0], expected_output = timed(
+            model, model_iter_fn, example_inputs, return_result=True
+        )
+        timings[rep, 1], actual_output = timed(
+            model, frozen_model_iter_fn, example_inputs, return_result=True
+        )
+        is_correct = is_correct and is_output_correct(expected_output, actual_output)
+
+    pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
+    median = np.median(timings, axis=0)
+    speedup = median[0] / median[1]
+    logger.info(
+        f"eager:{format_time(median[0])} "
+        f"optimized:{format_time(median[1])} "
+        f"speedup:{speedup:.3f}x p:{pvalue:.3f}"
+    )
+    if not is_correct:
+        logger.error("Result is incorrect.")
+        logger.error(f"Expected (PyTorch eager): {expected_output}")
+        logger.error(f"Actual (Optimized): {actual_output}")
+
+    return ""
+
+
+def get_torch_device_type(target: tvm.target.Target) -> str:
+    """
+    Map a TVM target to the torch device type expected by the TorchBench runner.
+    """
+    if target.kind.name == "llvm":
+        return "cpu"
+    elif target.kind.name == "cuda":
+        return "cuda"
+    else:
+        raise RuntimeError(f"Unsupported target {target}")
+
+
+def main():
+    """
+    Entry point of the benchmark
+    """
+    describe()
+
+    if not ARGS.mode.should_tune:
+        ms_database = ms.default_config.database(None, ARGS.work_dir)
+        if len(ms_database) == 0:
+            raise RuntimeError(
+                "Script is running in eval mode while the tuning database is empty. "
+                "Please tune the model first."
+            )
+
+    if IS_CUDA and ARGS.cpu_flush:
+        warnings.warn(
+            "Benchmark is running on CUDA, while --cpu-flush is turned on. "
+            "This flag will have no effect on CUDA."
+ ) + + try: + _, name, model, example_inputs, batch_size = runner.load_model( + get_torch_device_type(ARGS.target), + ARGS.model, + batch_size=ARGS.batch_size, + ) + logger.info( + f"batch size: {batch_size} input shape: {[input.shape for input in example_inputs]}" + ) + except NotImplementedError: + logging.exception(f"{ARGS.model} failed to load") + return + + tuning_tasks: List[ms.ExtractedTask] = [] + task_collect_ctx = torchdynamo.optimize(create_tvm_task_collection_backend(tuning_tasks)) + task_collect_ctx(runner.model_iter_fn)(model, example_inputs) + + database = ms.tune_extracted_tasks( + extracted_tasks=tuning_tasks, + config=get_tune_config(), + work_dir=ARGS.work_dir, + runner=get_metaschedule_runner(), # type: ignore + ) + + if ARGS.mode.should_eval: + torchdynamo.reset() + model_compile_ctx = torchdynamo.optimize(create_tvm_compilation_backend(database)) + experiment = functools.partial(performance_experiment, runner.model_iter_fn) + runner.run_one_model(name, model, example_inputs, model_compile_ctx, experiment) + + +if __name__ == "__main__": + main() diff --git a/python/tvm/meta_schedule/testing/torchbench/utils.py b/python/tvm/meta_schedule/testing/torchbench/utils.py new file mode 100644 index 000000000000..f5a745ea008a --- /dev/null +++ b/python/tvm/meta_schedule/testing/torchbench/utils.py @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper functions for running TorchBench through the benchmark functions +from TorchDynamo. +""" + +import os +import sys +from dataclasses import dataclass + +import torch # type: ignore + + +def find_torchdynamo() -> str: + """ + Find the directory of TorchDynamo repo. + + It can't directly import the benchmark runner in TorchDynamo + becuase it isn't designed to be used as a Python package. + """ + candidates = [ + "torchdynamo", + "../torchdynamo", + "../../torchdynamo", + ] + for library_dir in candidates: + if os.path.exists(f"{library_dir}/benchmarks"): + return library_dir + + raise RuntimeError( + """ + Cannot find directory for torchdynamo. + You need to clone https://github.com/pytorch/torchdynamo to the parent directory of cwd. + """ + ) + + +DYNAMO_DIR = find_torchdynamo() +sys.path.append(DYNAMO_DIR) +sys.path.append(f"{DYNAMO_DIR}/benchmarks") + +# pylint: disable=wrong-import-position, unused-import +from benchmarks.common import same, timed # type: ignore +from torchbench import TorchBenchmarkRunner # type: ignore + +# pylint: disable=wrong-import-position, unused-import + + +def load_torchdynamo_benchmark_runner( + is_cuda: bool, cosine_similarity: bool = False +) -> TorchBenchmarkRunner: + """ + Load the benchmark runner from TorchDynamo. 
+ """ + + @dataclass + class RunnerArgs: + """ + This class simulates the parsed args required by the benchmark code from TorchDynamo. + """ + + ci: bool = False # Whether runs in CI mode. pylint: disable=invalid-name + training: bool = False # Whether it benchmarks training workload. + use_eval_mode: bool = True # Whether the model should be in eval mode. + dynamic_shapes: bool = False # Whether runs the model in dynamic shape mode. + float16: bool = False # Whether to cast model and inputs to float16 + float32: bool = False # Whether to cast model and inputs to float32 + + accuracy: bool = False # Whether to perform a accuracy test + performance: bool = True # Whether to perform a performance test + + cosine: bool = False # Whether to use consine similarity to check if output is correct. + + args = RunnerArgs(cosine=cosine_similarity) + + runner = TorchBenchmarkRunner() + runner.args = args + runner.model_iter_fn = runner.forward_pass + + if is_cuda: + # pylint: disable=import-outside-toplevel + import benchmarks.common # type: ignore + + # pylint: enable=import-outside-toplevel + + benchmarks.common.synchronize = torch.cuda.synchronize + + return runner