From 299a3cdf947fd4c07ec098cf3752ea95577d8810 Mon Sep 17 00:00:00 2001
From: Siyuan Feng
Date: Thu, 11 Jul 2024 16:03:17 +0800
Subject: [PATCH] Customized Opt

---
 docs/conf.py                           |   2 +
 docs/how_to/tutorials/README.rst       |   2 +
 docs/how_to/tutorials/customize_opt.py | 189 +++++++++++++++++++++++++
 docs/index.rst                         |   5 +
 4 files changed, 198 insertions(+)
 create mode 100644 docs/how_to/tutorials/README.rst
 create mode 100644 docs/how_to/tutorials/customize_opt.py

diff --git a/docs/conf.py b/docs/conf.py
index 5c1104b..b11c0ce 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -312,11 +312,13 @@ def fixup_tutorials(original_url: str) -> str:
 
 # Sphinx-Gallery Settings
 examples_dirs = [
     f"{home_path}/get_started/tutorials",
+    f"{home_path}/how_to/tutorials/",
     f"{home_path}/deep_dive/tensor_ir/tutorials/",
 ]
 
 gallery_dirs = [
     "get_started/tutorials/",
+    "how_to/tutorials/",
     "deep_dive/tensor_ir/tutorials/",
 ]
diff --git a/docs/how_to/tutorials/README.rst b/docs/how_to/tutorials/README.rst
new file mode 100644
index 0000000..9cec77e
--- /dev/null
+++ b/docs/how_to/tutorials/README.rst
@@ -0,0 +1,2 @@
+HOW TO
+------
diff --git a/docs/how_to/tutorials/customize_opt.py b/docs/how_to/tutorials/customize_opt.py
new file mode 100644
index 0000000..9c1c111
--- /dev/null
+++ b/docs/how_to/tutorials/customize_opt.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+.. _customize_opt:
+
+Customize Optimization
+======================
+One main design goal of Apache TVM is to make it easy to customize the optimization
+pipeline for both research and development purposes, and to iterate on engineering
+optimizations. In this tutorial we will demonstrate how to customize the optimization
+pipeline.
+
+.. contents:: Table of Contents
+   :local:
+   :depth: 1
+"""
+
+######################################################################
+# Review Overall Flow
+# -------------------
+# .. figure:: ../../_static/img/overview.svg
+#    :align: center
+#    :width: 80%
+#
+# The overall flow consists of the following steps:
+#
+# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
+#   model from other frameworks (e.g., PyTorch, ONNX), and create the TVM IRModule, which contains
+#   all the information needed for compilation, including high-level Relax functions for the
+#   computational graph, and low-level TensorIR functions for the tensor programs.
+# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
+#   such as graph optimizations, tensor program optimizations, and library dispatching.
+# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
+#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.
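+#
+# As a rough, non-executable sketch (``mod`` stands for any IRModule obtained in the
+# first step; the pass list and the CPU target are illustrative only, not a recommended
+# pipeline), the last two steps map to code as follows:
+#
+# .. code-block:: python
+#
+#     # Step 2: compose optimization passes over the IRModule.
+#     mod = tvm.ir.transform.Sequential([relax.transform.LegalizeOps()])(mod)
+#
+#     # Step 3: build for a target and run on the matching device.
+#     ex = relax.build(mod, target="llvm")
+#     vm = relax.VirtualMachine(ex, tvm.cpu())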
+#
+
+import tempfile
+import numpy as np
+import tvm
+from tvm import IRModule, relax
+from tvm.relax.frontend import nn
+
+######################################################################
+# Composable IRModule Optimization
+# --------------------------------
+# Apache TVM Unity provides a flexible way to optimize the IRModule. Since everything is
+# centered around the IRModule, optimizations can be composed with existing pipelines.
+# Note that each optimization can focus on **part of the computation graph**, enabling
+# partial lowering or partial optimization.
+#
+# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.
+
+######################################################################
+# Prepare a Relax Module
+# ~~~~~~~~~~~~~~~~~~~~~~
+# We first prepare a Relax module. The module can be imported from other frameworks,
+# constructed with the NN module frontend, or written directly in TVMScript. Here we use
+# a simple neural network model as an example.
+
+
+class RelaxModel(nn.Module):
+    def __init__(self):
+        super(RelaxModel, self).__init__()
+        self.fc1 = nn.Linear(784, 256)
+        self.relu1 = nn.ReLU()
+        self.fc2 = nn.Linear(256, 10, bias=False)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu1(x)
+        x = self.fc2(x)
+        return x
+
+
+input_shape = (1, 784)
+mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}})
+mod.show()
+
+######################################################################
+# Library Dispatch
+# ~~~~~~~~~~~~~~~~
+# We often want to quickly try out a library-based optimization for a certain platform
+# (e.g., GPU). We can write a dispatching pass for the specific platform and operator.
+# Here we demonstrate how to dispatch certain patterns to the CUBLAS library.
+#
+# .. note::
+#    This tutorial demonstrates dispatching only a single operator to CUBLAS, highlighting
+#    the flexibility of the optimization pipeline. In real-world cases, we can import
+#    multiple patterns and dispatch them to different kernels; see the sketch below.
+
+
+# Import cublas pattern
+import tvm.relax.backend.contrib.cublas as _cublas
+
+
+# Define a new pass for CUBLAS dispatch
+@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
+class CublasDispatch:
+    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
+        # Check if CUBLAS is enabled
+        if not tvm.get_global_func("relax.ext.cublas", True):
+            raise Exception("CUBLAS is not enabled.")
+
+        # Get the pattern we are interested in
+        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
+        # Note: in real-world cases, we usually fetch all available patterns, e.g.
+        # patterns = relax.backend.get_patterns_with_prefix("cublas")
+
+        # Fuse ops by patterns and then run codegen
+        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
+        mod = relax.transform.RunCodegen()(mod)
+        return mod
+
+
+mod = CublasDispatch()(mod)
+mod.show()
+
+######################################################################
+# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are
+# fused and rewritten into a ``call_dps_packed`` function which calls the CUBLAS library.
+# Notably, the other parts are unchanged, which means we can selectively dispatch
+# optimizations to certain parts of the computation.
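+
+######################################################################
+# As the note above mentions, real pipelines usually dispatch every pattern registered
+# under a library prefix rather than a single hand-picked one. A minimal sketch (not
+# executed here, since the module above has already been dispatched) would replace the
+# pattern list inside the pass with:
+#
+# .. code-block:: python
+#
+#     patterns = relax.backend.get_patterns_with_prefix("cublas")
+#     mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
+#     mod = relax.transform.RunCodegen()(mod)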
+
+######################################################################
+# Auto Tuning
+# ~~~~~~~~~~~
+# Continuing from the previous example, we can further optimize the model with auto-tuning
+# for the **rest of the computation**. Here we demonstrate how to use MetaSchedule to
+# auto-tune the model.
+#
+# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
+# ``MetaScheduleApplyDatabase`` pass to apply the best configuration to the model. The
+# tuning pass generates a search space and tunes the model, while the database pass
+# applies the best configuration found. Before running these passes, we need to lower
+# Relax operators into TensorIR functions via ``LegalizeOps``.
+
+
+target = tvm.target.Target("nvidia/geforce-rtx-3090-ti")
+with target, tempfile.TemporaryDirectory() as tmp_dir:
+    mod = tvm.ir.transform.Sequential(
+        [
+            relax.transform.LegalizeOps(),
+            relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=64),
+            relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
+        ]
+    )(mod)
+
+mod.show()
+
+######################################################################
+# .. note::
+#    This tutorial focuses on demonstrating the optimization pipeline, not on pushing
+#    performance to its limit, so the resulting configuration may not be optimal.
+
+######################################################################
+# Deploy the Optimized Model
+# --------------------------
+# We can now build and deploy the optimized model to the TVM runtime.
+
+ex = relax.build(mod, target="cuda")
+dev = tvm.device("cuda", 0)
+vm = relax.VirtualMachine(ex, dev)
+# Data and params need to be allocated on the GPU device
+data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
+gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
+gpu_out = vm["forward"](data, *gpu_params).numpy()
+print(gpu_out)
+
+
+######################################################################
+# Summary
+# -------
+# This tutorial demonstrates how to customize the optimization pipeline for ML models in
+# Apache TVM. We can easily compose optimization passes and customize the optimization for
+# different parts of the computation graph. The flexibility of the optimization pipeline
+# enables us to quickly iterate on optimizations and improve model performance.
+#
diff --git a/docs/index.rst b/docs/index.rst
index 690349c..74d2f3a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,6 +29,11 @@ TVM Unity is developed under Apache TVM's Github Repo in a separate `unity` bran
    get_started/tutorials/quick_start
    get_started/tutorials/ir_module
 
+.. toctree::
+   :maxdepth: 1
+   :caption: HOW TO
+
+   how_to/tutorials/customize_opt
 
 .. The Deep Dive content is comprehensive
 .. we maintain a ``maxdepth`` of 2 to display more information on the main page.