From 299a3cdf947fd4c07ec098cf3752ea95577d8810 Mon Sep 17 00:00:00 2001
From: Siyuan Feng
Date: Thu, 11 Jul 2024 16:03:17 +0800
Subject: [PATCH] Customized Opt

---
 docs/conf.py                           |   2 +
 docs/how_to/tutorials/README.rst       |   2 +
 docs/how_to/tutorials/customize_opt.py | 189 +++++++++++++++++++++++++
 docs/index.rst                         |   5 +
 4 files changed, 198 insertions(+)
 create mode 100644 docs/how_to/tutorials/README.rst
 create mode 100644 docs/how_to/tutorials/customize_opt.py

diff --git a/docs/conf.py b/docs/conf.py
index 5c1104b..b11c0ce 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -312,11 +312,13 @@ def fixup_tutorials(original_url: str) -> str:
 
 # Sphinx-Gallery Settings
 examples_dirs = [
     f"{home_path}/get_started/tutorials",
+    f"{home_path}/how_to/tutorials/",
     f"{home_path}/deep_dive/tensor_ir/tutorials/",
 ]
 
 gallery_dirs = [
     "get_started/tutorials/",
+    "how_to/tutorials/",
     "deep_dive/tensor_ir/tutorials/",
 ]
diff --git a/docs/how_to/tutorials/README.rst b/docs/how_to/tutorials/README.rst
new file mode 100644
index 0000000..9cec77e
--- /dev/null
+++ b/docs/how_to/tutorials/README.rst
@@ -0,0 +1,2 @@
+HOW TO
+------
diff --git a/docs/how_to/tutorials/customize_opt.py b/docs/how_to/tutorials/customize_opt.py
new file mode 100644
index 0000000..9c1c111
--- /dev/null
+++ b/docs/how_to/tutorials/customize_opt.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+.. _customize_opt:
+
+Customize Optimization
+======================
+One main design goal of Apache TVM is to make it easy to customize the optimization
+pipeline for both research and development purposes, and to iterate on engineering
+optimizations. In this tutorial we will demonstrate how to customize the optimization
+pipeline.
+
+.. contents:: Table of Contents
+   :local:
+   :depth: 1
+"""
+
+######################################################################
+# Review Overall Flow
+# -------------------
+# .. figure:: ../../_static/img/overview.svg
+#    :align: center
+#    :width: 80%
+#
+# The overall flow consists of the following steps:
+#
+# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
+#   model from other frameworks (e.g., PyTorch, ONNX), and create the TVM IRModule, which contains
+#   all the information needed for compilation, including high-level Relax functions for the
+#   computational graph, and low-level TensorIR functions for the tensor programs.
+# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
+#   such as graph optimizations, tensor program optimizations, and library dispatching.
+# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
+#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.
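+#
+# As a rough, non-executable sketch (``mod`` stands for any IRModule obtained in the
+# first step; the pass list and the CPU target are illustrative only, not a recommended
+# pipeline), the last two steps map to code as follows:
+#
+# .. code-block:: python
+#
+#     # Step 2: compose optimization passes over the IRModule.
+#     mod = tvm.ir.transform.Sequential([relax.transform.LegalizeOps()])(mod)
+#
+#     # Step 3: build for a target and run on the matching device.
+#     ex = relax.build(mod, target="llvm")
+#     vm = relax.VirtualMachine(ex, tvm.cpu())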
+#
+
+import tempfile
+import numpy as np
+import tvm
+from tvm import IRModule, relax
+from tvm.relax.frontend import nn
+
+######################################################################
+# Composable IRModule Optimization
+# --------------------------------
+# Apache TVM Unity provides a flexible way to optimize the IRModule. Since everything is
+# centered around the IRModule, optimizations can be composed with existing pipelines.
+# Note that each optimization can focus on **part of the computation graph**, enabling
+# partial lowering or partial optimization.
+#
+# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.
+
+######################################################################
+# Prepare a Relax Module
+# ~~~~~~~~~~~~~~~~~~~~~~
+# We first prepare a Relax module. The module can be imported from other frameworks,
+# constructed with the NN module frontend, or written directly in TVMScript. Here we use
+# a simple neural network model as an example.
+
+
+class RelaxModel(nn.Module):
+    def __init__(self):
+        super(RelaxModel, self).__init__()
+        self.fc1 = nn.Linear(784, 256)
+        self.relu1 = nn.ReLU()
+        self.fc2 = nn.Linear(256, 10, bias=False)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu1(x)
+        x = self.fc2(x)
+        return x
+
+
+input_shape = (1, 784)
+mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}})
+mod.show()
+
+######################################################################
+# Library Dispatch
+# ~~~~~~~~~~~~~~~~
+# We often want to quickly try out a library-based optimization for a certain platform
+# (e.g., GPU). We can write a dispatching pass for the specific platform and operator.
+# Here we demonstrate how to dispatch certain patterns to the CUBLAS library.
+#
+# .. note::
+#    This tutorial demonstrates dispatching only a single operator to CUBLAS, highlighting
+#    the flexibility of the optimization pipeline. In real-world cases, we can import
+#    multiple patterns and dispatch them to different kernels; see the sketch below.
+
+
+# Import cublas pattern
+import tvm.relax.backend.contrib.cublas as _cublas
+
+
+# Define a new pass for CUBLAS dispatch
+@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
+class CublasDispatch:
+    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
+        # Check if CUBLAS is enabled
+        if not tvm.get_global_func("relax.ext.cublas", True):
+            raise Exception("CUBLAS is not enabled.")
+
+        # Get the pattern we are interested in
+        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
+        # Note: in real-world cases, we usually fetch all available patterns, e.g.
+        # patterns = relax.backend.get_patterns_with_prefix("cublas")
+
+        # Fuse ops by patterns and then run codegen
+        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
+        mod = relax.transform.RunCodegen()(mod)
+        return mod
+
+
+mod = CublasDispatch()(mod)
+mod.show()
+
+######################################################################
+# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are
+# fused and rewritten into a ``call_dps_packed`` function which calls the CUBLAS library.
+# Notably, the other parts are unchanged, which means we can selectively dispatch
+# optimizations to certain parts of the computation.
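+
+######################################################################
+# As the note above mentions, real pipelines usually dispatch every pattern registered
+# under a library prefix rather than a single hand-picked one. A minimal sketch (not
+# executed here, since the module above has already been dispatched) would replace the
+# pattern list inside the pass with:
+#
+# .. code-block:: python
+#
+#     patterns = relax.backend.get_patterns_with_prefix("cublas")
+#     mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
+#     mod = relax.transform.RunCodegen()(mod)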
+
+######################################################################
+# Auto Tuning
+# ~~~~~~~~~~~
+# Continuing from the previous example, we can further optimize the model with auto-tuning
+# for the **rest of the computation**. Here we demonstrate how to use MetaSchedule to
+# auto-tune the model.
+#
+# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
+# ``MetaScheduleApplyDatabase`` pass to apply the best configuration to the model. The
+# tuning pass generates a search space and tunes the model, while the database pass
+# applies the best configuration found. Before running these passes, we need to lower
+# Relax operators into TensorIR functions via ``LegalizeOps``.
+
+
+target = tvm.target.Target("nvidia/geforce-rtx-3090-ti")
+with target, tempfile.TemporaryDirectory() as tmp_dir:
+    mod = tvm.ir.transform.Sequential(
+        [
+            relax.transform.LegalizeOps(),
+            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=64),
+            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
+        ]
+    )(mod)
+
+mod.show()
+
+######################################################################
+# .. note::
+#    This tutorial focuses on demonstrating the optimization pipeline, not on pushing
+#    performance to its limit, so the resulting configuration may not be optimal.
+
+######################################################################
+# Deploy the Optimized Model
+# --------------------------
+# We can now build and deploy the optimized model to the TVM runtime.
+
+ex = relax.build(mod, target="cuda")
+dev = tvm.device("cuda", 0)
+vm = relax.VirtualMachine(ex, dev)
+# Data and params need to be allocated on the GPU device
+data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
+gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
+gpu_out = vm["forward"](data, *gpu_params).numpy()
+print(gpu_out)
+
+
+######################################################################
+# Summary
+# -------
+# This tutorial demonstrates how to customize the optimization pipeline for ML models in
+# Apache TVM. We can easily compose optimization passes and customize the optimization for
+# different parts of the computation graph. The flexibility of the optimization pipeline
+# enables us to quickly iterate on optimizations and improve model performance.
+#
diff --git a/docs/index.rst b/docs/index.rst
index 690349c..74d2f3a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,6 +29,11 @@ TVM Unity is developed under Apache TVM's Github Repo in a separate `unity` bran
    get_started/tutorials/quick_start
    get_started/tutorials/ir_module
 
+.. toctree::
+   :maxdepth: 1
+   :caption: HOW TO
+
+   how_to/tutorials/customize_opt
 
 .. The Deep Dive content is comprehensive
 .. we maintain a ``maxdepth`` of 2 to display more information on the main page.