From 135cc0318f2f02a24e94dcd5ba1fec765fa9664b Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 6 Aug 2021 17:11:40 +0000 Subject: [PATCH 1/2] Proof of concept for documentation refactor. Proposal for documentation refactor. Changes include: Major changes to top-level structure. Separation of user and developer docs. Reclassifying many tutorials as how-tos. Moving sphinx galleries to top level "gallery" directory. Consistent naming between galleries and top-level structure --- docs/conf.py | 16 +- docs/dev_deep_dive/index.rst | 25 + docs/dev_how_tos/index.rst | 25 + docs/dev_reference/index.rst | 24 + docs/dev_tutorials/index.rst | 24 + docs/index.rst | 51 +- docs/user_deep_dive/index.rst | 24 + docs/user_how_tos/index.rst | 23 + docs/user_reference/index.rst | 23 + docs/user_tutorials/index.rst | 25 + gallery/user_tutorials/README.txt | 4 + .../auto_scheduler_matmul_x86.py | 214 +++++ gallery/user_tutorials/autotvm_matmul_x86.py | 377 ++++++++ gallery/user_tutorials/autotvm_relay_x86.py | 476 +++++++++ .../cross_compilation_and_rpc.py | 265 +++++ gallery/user_tutorials/install.py | 50 + gallery/user_tutorials/introduction.py | 134 +++ gallery/user_tutorials/relay_quick_start.py | 155 +++ .../user_tutorials/tensor_expr_get_started.py | 903 ++++++++++++++++++ .../tvmc_command_line_driver.py | 511 ++++++++++ 20 files changed, 3308 insertions(+), 41 deletions(-) create mode 100644 docs/dev_deep_dive/index.rst create mode 100644 docs/dev_how_tos/index.rst create mode 100644 docs/dev_reference/index.rst create mode 100644 docs/dev_tutorials/index.rst create mode 100644 docs/user_deep_dive/index.rst create mode 100644 docs/user_how_tos/index.rst create mode 100644 docs/user_reference/index.rst create mode 100644 docs/user_tutorials/index.rst create mode 100644 gallery/user_tutorials/README.txt create mode 100644 gallery/user_tutorials/auto_scheduler_matmul_x86.py create mode 100644 gallery/user_tutorials/autotvm_matmul_x86.py create mode 100644 gallery/user_tutorials/autotvm_relay_x86.py create mode 100644 gallery/user_tutorials/cross_compilation_and_rpc.py create mode 100644 gallery/user_tutorials/install.py create mode 100644 gallery/user_tutorials/introduction.py create mode 100644 gallery/user_tutorials/relay_quick_start.py create mode 100644 gallery/user_tutorials/tensor_expr_get_started.py create mode 100644 gallery/user_tutorials/tvmc_command_line_driver.py diff --git a/docs/conf.py b/docs/conf.py index 4a0455214db3..6962c448ed53 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -197,11 +197,12 @@ def git_describe_version(original_version): from sphinx_gallery.sorting import ExplicitOrder -examples_dirs = ["../tutorials/", "../vta/tutorials/"] -gallery_dirs = ["tutorials", "vta/tutorials"] +examples_dirs = ["../gallery/user_tutorials", "../tutorials/", "../vta/tutorials/"] +gallery_dirs = ["user_tutorials", "tutorials", "vta/tutorials"] subsection_order = ExplicitOrder( [ + "../gallery/user_tutorials", "../tutorials/get_started", "../tutorials/frontend", "../tutorials/language", @@ -223,6 +224,17 @@ def git_describe_version(original_version): # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. 
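# A hedged aside (hypothetical helper, not part of this patch): an ordering
# dict such as ``within_subsection_order`` defined just below can back a
# sphinx-gallery sort key roughly as sketched here, keeping listed files in
# their listed position and sorting everything else after them by filename.
#
#     import os
#
#     class ListedFirstSortKey:
#         def __init__(self, src_dir):
#             self.src_dir = src_dir
#
#         def __call__(self, filename):
#             # Listed files keep their position; unlisted files sort after them.
#             listed = within_subsection_order.get(os.path.basename(self.src_dir), [])
#             if filename in listed:
#                 return "\0%03d" % listed.index(filename)
#             return filename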
within_subsection_order = { + "user_tutorials": [ + "introduction.py", + "install.py", + "tvmc_command_line_driver.py", + "autotvm_relay_x86.py", + "tensor_expr_get_started.py", + "autotvm_matmul_x86.py", + "auto_scheduler_matmul_x86.py", + "cross_compilation_and_rpc.py", + "relay_quick_start.py", + ], "get_started": [ "introduction.py", "install.py", diff --git a/docs/dev_deep_dive/index.rst b/docs/dev_deep_dive/index.rst new file mode 100644 index 000000000000..41189226b8c1 --- /dev/null +++ b/docs/dev_deep_dive/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _deep_dive: + +Deep Dive +========= +TVM Developer Deep Dives + +Refactor placeholder + diff --git a/docs/dev_how_tos/index.rst b/docs/dev_how_tos/index.rst new file mode 100644 index 000000000000..e8cce9365df2 --- /dev/null +++ b/docs/dev_how_tos/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _reference: + +Deep Dive +========= +TVM Developer Deep Dive + +Refactor placeholder + diff --git a/docs/dev_reference/index.rst b/docs/dev_reference/index.rst new file mode 100644 index 000000000000..8a118d442e0b --- /dev/null +++ b/docs/dev_reference/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_reference: + +Reference +========= +TVM Developer Reference + +Placeholder for docs refactor diff --git a/docs/dev_tutorials/index.rst b/docs/dev_tutorials/index.rst new file mode 100644 index 000000000000..2705f21907b0 --- /dev/null +++ b/docs/dev_tutorials/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _tutorial: + +Tutorial +======== +TVM Developer Tutorial + +Placeholder for documentation refactor diff --git a/docs/index.rst b/docs/index.rst index 491c42712e9a..ad1cfc49d4dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,57 +34,30 @@ For Developers architecture of TVM and/or actively develop on the project. - :doc:`dev/how_to` gives quick development tips on various topics. - -.. toctree:: - :maxdepth: 1 - :caption: How to - :hidden: - - install/index - contribute/index - deploy/index - dev/how_to - microtvm/index - errors - faq - -.. toctree:: - :maxdepth: 1 - :caption: Tutorials - :hidden: - - tutorials/index - - -.. toctree:: - :maxdepth: 2 - :caption: References - :hidden: - - langref/index - api/python/index - api/links - .. toctree:: :maxdepth: 2 + :caption: TVM User Guide :hidden: - :caption: Deep Dive - dev/index + user_tutorials/index + user_how_tos/index + user_deep_dive/index + user_reference/index .. toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: TVM Developer Guide :hidden: - :caption: MISC - - vta/index - profiling/index + dev_tutorials/index + dev_how_tos/index + dev_deep_dive/index + dev_reference/index Index ----- - .. toctree:: :maxdepth: 1 + :caption: Index genindex diff --git a/docs/user_deep_dive/index.rst b/docs/user_deep_dive/index.rst new file mode 100644 index 000000000000..2e116ddbcc70 --- /dev/null +++ b/docs/user_deep_dive/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_user_guide: + +Deep Dive +========= +TVM User Deep Dives + +refactor placeholder until content is in place diff --git a/docs/user_how_tos/index.rst b/docs/user_how_tos/index.rst new file mode 100644 index 000000000000..cc59879e1057 --- /dev/null +++ b/docs/user_how_tos/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _user_guide: + +How To +======= +TVM How Tos + diff --git a/docs/user_reference/index.rst b/docs/user_reference/index.rst new file mode 100644 index 000000000000..cc59879e1057 --- /dev/null +++ b/docs/user_reference/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _user_guide: + +How To +======= +TVM How Tos + diff --git a/docs/user_tutorials/index.rst b/docs/user_tutorials/index.rst new file mode 100644 index 000000000000..4f01312426f2 --- /dev/null +++ b/docs/user_tutorials/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_user_guide: + +Tutorial +========= +TVM User Tutorial + +This is a placeholder and is overwritten by Spinx Gallery +/tvm/gallery/user_tutorials diff --git a/gallery/user_tutorials/README.txt b/gallery/user_tutorials/README.txt new file mode 100644 index 000000000000..0d5a287feff4 --- /dev/null +++ b/gallery/user_tutorials/README.txt @@ -0,0 +1,4 @@ +Tutorials +--------- + +Within this gallery is an introduction to TVM. diff --git a/gallery/user_tutorials/auto_scheduler_matmul_x86.py b/gallery/user_tutorials/auto_scheduler_matmul_x86.py new file mode 100644 index 000000000000..f9fb3615aedc --- /dev/null +++ b/gallery/user_tutorials/auto_scheduler_matmul_x86.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Optimizing Operators with Auto-scheduling +========================================= +**Author**: `Lianmin Zheng `_, \ + `Chengfan Jia `_ + +In this tutorial, we will show how TVM's Auto Scheduling feature can find +optimal schedules without the need for writing a custom template. + +Different from the template-based :doc:`AutoTVM ` which relies on +manual templates to define the search space, the auto-scheduler does not +require any templates. Users only need to write the computation declaration +without any schedule commands or templates. The auto-scheduler can +automatically generate a large search space and find a good schedule in the +space. + +We use matrix multiplication as an example in this tutorial. + +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. To + get it to run, you will need to wrap the body of this tutorial in a :code:`if + __name__ == "__main__":` block. +""" + +import os + +import numpy as np +import tvm +from tvm import te, auto_scheduler + +################################################################################ +# Defining the Matrix Multiplication +# ---------------------------------- +# To start, we define a matrix multiplication with a bias addition. Note that +# this uses standard operations available in TVMs Tensor Expression language. +# The major difference is the use of the `auto_sceduler` decorator at the top +# of the function definition. The function should return a list of +# input/output tensors. From these tensors, the auto-scheduler can get the +# whole computational graph. 
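################################################################################
# .. note:: Registering a workload under an explicit name
#
#   A hedged aside, not part of the original tutorial: assuming
#   ``register_workload`` also accepts an explicit string key, a workload can
#   be registered by name and the search task created from that string later
#   on, for example:
#
#   .. code-block:: python
#
#     @auto_scheduler.register_workload("plain_matmul")
#     def plain_matmul(N, L, M, dtype):
#         A = te.placeholder((N, L), name="A", dtype=dtype)
#         B = te.placeholder((L, M), name="B", dtype=dtype)
#         k = te.reduce_axis((0, L), name="k")
#         C = te.compute(
#             (N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C"
#         )
#         return [A, B, C]
#
#     task_by_name = auto_scheduler.SearchTask(
#         func="plain_matmul", args=(128, 128, 128, "float32"), target="llvm"
#     )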
+ + +@auto_scheduler.register_workload # Note the auto_scheduler decorator +def matmul_add(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + C = te.placeholder((N, M), name="C", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + matmul = te.compute( + (N, M), + lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), + name="matmul", + attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B + ) + out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") + + return [A, B, C, out] + + +################################################################################ +# Create the search task +# ---------------------- +# With the function defined, we can now create the task for the auto_scheduler +# to search against. We specify the particular parameters for this matrix +# multiplication, in this case a multiplication of to square matricies of size +# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" +# +# .. note:: Improve performance with custom targets +# In order for TVM to take full advantage of specific hardware platforms, +# you will want to manuall specify your CPU capabilities. For example: +# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 +# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 + +target = tvm.target.Target("llvm") +N = L = M = 1024 +task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) + +# Inspect the computational graph +print("Computational DAG:") +print(task.compute_dag) + +################################################################################ +# Set Parameters for Auto-Scheduler +# --------------------------------- +# Next, we set parameters for the auto-scheduler. +# +# * :code:`num_measure_trials` is the number of measurement trials we can use +# during the search. We only make 10 trials in this tutorial for a fast +# demonstration. In practice, 1000 is a good value for the search to converge. +# You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to log measurement records into a +# file `matmul.json`. The measurement records can be used to query the history +# best, resume the search, and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions` for more parameters + +log_file = "matmul.json" +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, +) + +################################################################################ +# Run the search +# -------------- +# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the +# search and let the auto-scheduler do its magic. After some measurement +# trials, we can load the best schedule from the log file and apply it. + +# Run auto-tuning (search) +task.tune(tune_option) +# Apply the best schedule +sch, args = task.apply_best(log_file) + +################################################################################ +# Inspecting the Optimized Schedule +# --------------------------------- +# We can lower the schedule to see the IR after auto-scheduling. The +# auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and +# operator fusion. 
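################################################################################
# For contrast (a sketch, not part of the original tutorial), the same compute
# definition can also be lowered with the default, untouched TE schedule; the
# auto-scheduled IR printed just below is then easy to compare against it.

naive_A, naive_B, naive_C, naive_out = matmul_add(N, L, M, "float32")
naive_sch = te.create_schedule(naive_out.op)
print("Naive TIR (no auto-scheduling, for comparison):")
print(tvm.lower(naive_sch, [naive_A, naive_B, naive_C, naive_out], simple_mode=True))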
+ +print("Lowered TIR:") +print(tvm.lower(sch, args, simple_mode=True)) + +################################################################################ +# Check correctness and evaluate performance +# ------------------------------------------ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) +a_np = np.random.uniform(size=(N, L)).astype(np.float32) +b_np = np.random.uniform(size=(L, M)).astype(np.float32) +c_np = np.random.uniform(size=(N, M)).astype(np.float32) +out_np = a_np.dot(b_np) + c_np + +dev = tvm.cpu() +a_tvm = tvm.nd.array(a_np, device=dev) +b_tvm = tvm.nd.array(b_np, device=dev) +c_tvm = tvm.nd.array(c_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) +func(a_tvm, b_tvm, c_tvm, out_tvm) + +# Check results +np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) + +# Evaluate execution time. +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) +) + + +################################################################################ +# Using the record file +# --------------------- +# During the search, all measurement records are logged into the record file +# "matmul.json". The measurement records can be used to re-apply search +# results, resume the search, and perform other analyses. +# +# Here is an example where we load the best schedule from a file, and print the +# equivalent python schedule API. This can be used for debugging and learning +# the behavior of the auto-scheduler. + +print("Equivalent python schedule:") +print(task.print_best(log_file)) + +################################################################################ +# A more complicated example is to resume the search. In this case, we need to +# create the search policy and cost model by ourselves and resume the status of +# search policy and cost model with the log file. In the example below we +# resume the status and do more 5 trials. + + +def resume_search(task, log_file): + print("Resume search:") + cost_model = auto_scheduler.XGBModel() + cost_model.update_from_file(log_file) + search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] + ) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] + ) + task.tune(tune_option, search_policy=search_policy) + + +resume_search(task, log_file) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to use the TVM Auto-Scheduler to +# automatically optimize a matrix multiplication, without the need to specify a +# search template. It ends a series of examples that starts from the Tensor +# Expression (TE) language that demonstrates how TVM can optimize computational +# operations. diff --git a/gallery/user_tutorials/autotvm_matmul_x86.py b/gallery/user_tutorials/autotvm_matmul_x86.py new file mode 100644 index 000000000000..f9b33b894192 --- /dev/null +++ b/gallery/user_tutorials/autotvm_matmul_x86.py @@ -0,0 +1,377 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-autotvm-matmul-x86: + +Optimizing Operators with Schedule Templates and AutoTVM +======================================================== +**Authors**: +`Lianmin Zheng `_, +`Chris Hoge `_ + +In this tutorial, we show how the TVM Tensor Expression (TE) language +can be used to write schedule templates that can be searched by AutoTVM to +find the optimal schedule. This process is called Auto-Tuning, which helps +automate the process of optimizing tensor computation. + +This tutorial builds on the previous `tutorial on how to write a matrix +multiplication using TE `. + +There are two steps in auto-tuning. + +- The first step is defining a search space. +- The second step is running a search algorithm to explore through this space. + +In this tutorial, you can learn how to perform these two steps in TVM. The whole +workflow is illustrated by a matrix multiplication example. + +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. + To get it to run, you will need to wrap the body of this tutorial in a + :code:`if __name__ == "__main__":` block. +""" + +################################################################################ +# Install dependencies +# -------------------- +# To use autotvm package in TVM, we need to install some extra dependencies. +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost cloudpickle +# +# To make TVM run faster in tuning, it is recommended to use cython as FFI of +# TVM. In the root directory of TVM, execute: +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Begin by importing the required packages. + +import logging +import sys + +import numpy as np +import tvm +from tvm import te +import tvm.testing + +# the module is called `autotvm` +from tvm import autotvm + +################################################################################ +# Basic Matrix Multiplication with TE +# ----------------------------------- +# Recall the basic implementation of matrix multiplication using TE. We write +# it down here with a few changes. We will wrap the multiplication in a python +# function definition. For simplicity, we will focus our attention on a split +# optimization, using a fixed value that defines the block size of the +# reordering. 
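################################################################################
# As a hedged aside (a sketch, not part of the original tutorial): once
# ``matmul_basic`` below is defined, it can be built and run directly to give
# an untuned baseline, for example:
#
# .. code-block:: python
#
#   s, tensors = matmul_basic(512, 512, 512, "float32")
#   baseline = tvm.build(s, tensors, target="llvm")
#   a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"))
#   b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"))
#   c = tvm.nd.empty((512, 512))
#   baseline(a, b, c)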
+ + +def matmul_basic(N, L, M, dtype): + + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + yo, yi = s[C].split(y, 8) + xo, xi = s[C].split(x, 8) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Matrix Multiplication with AutoTVM +# ---------------------------------- +# In the previous schedule code, we use a constant "8" as the tiling factor. +# However, it might not be the best one because the best tiling factor depends +# on real hardware environment and input shape. +# +# If you want the schedule code to be portable across a wider range of input +# shapes and target hardware, it is better to define a set of candidate values +# and pick the best one according to the measurement results on target +# hardware. +# +# In autotvm, we can define a tunable parameter, or a "knob" for such kind of +# value. + +################################################################################ +# A Basic Matrix Multiplication Template +# -------------------------------------- +# We begin with an example of how to create a tunable parameter set for the +# block size of the `split` scheduling operation. + +# Matmul V1: List candidate values +@autotvm.template("tutorial/matmul_v1") # 1. use a decorator +def matmul_v1(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + # 2. get the config object + cfg = autotvm.get_config() + + # 3. define search space + cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) + cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) + + # 4. schedule according to config + yo, yi = s[C].split(y, cfg["tile_y"].val) + xo, xi = s[C].split(x, cfg["tile_x"].val) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Here we make four modifications to the previous schedule code and get a +# tunable "template". We can explain the modifications one by one. +# +# 1. Use a decorator to mark this function as a simple template. +# 2. Get a config object: You can regard this :code:`cfg` as an argument of +# this function but we obtain it in a different way. With this argument, this +# function is no longer a deterministic schedule. Instead, we can pass +# different configurations to this function and get different schedules. A +# function that uses a configuration object like this is called a "template". +# +# To make the template function more compact, we can do two things to define +# the parameter search space within a single function. +# +# 1. Define a search space across a set values. This is done by making +# :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the +# tunable knobs in this function and build a search space from it. +# 2. Schedule according to an entity in this space. This is done by making +# :code:`cfg` a :any:`ConfigEntity` object. 
When it is a +# :any:`ConfigEntity`, it will ignore all space definition API (namely, +# :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic +# values for all tunable knobs, and we schedule according to these values. +# +# During auto-tuning, we will first call this template with a +# :any:`ConfigSpace` object to build the search space. Then we call this +# template with different :any:`ConfigEntity` in the built space to get +# different schedules. Finally we will measure the code generated by +# different schedules and pick the best one. +# +# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible +# values. The second one is :code:`tile_x` with a same list of possible values. +# These two knobs are independent, so they span a search space with size 25 = +# 5x5. +# 4. The configuration knobs are passed to the :code:`split` schedule +# operation, allowing us to schedule according to the 5x5 deterministic values +# we previously defined in :code:`cfg`. + +################################################################################ +# A Matrix Multiplication Template with the Advanced Parameter API +# ---------------------------------------------------------------- +# In the previous template, we manually listed all of the possible values for a +# knob. This is the lowest level API to define the space, and gives an explicit +# enumeration of the parameter space to search. However, we also provide +# another set of APIs that can make the definition of the search space easier +# and smarter. Where possible, we receomment you use this higher-level API +# +# In the following example, we use :any:`ConfigSpace.define_split` to define a +# split knob. It will enumerate all the possible ways to split an axis and +# construct the space. +# +# We also have :any:`ConfigSpace.define_reorder` for reorder knob and +# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, +# thread binding. When the high level API cannot meet your requirements, you +# can always fall back to using the low level API. + + +@autotvm.template("tutorial/matmul") +def matmul(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + ##### define space begin ##### + cfg = autotvm.get_config() + cfg.define_split("tile_y", y, num_outputs=2) + cfg.define_split("tile_x", x, num_outputs=2) + ##### define space end ##### + + # schedule according to config + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# .. note:: More Explanation on :code:`cfg.define_split` +# +# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will +# enumerate all possible combinations that can split axis y into two axes with +# factors of the length of y. For example, if the length of y is 32 and we +# want to split it into two axes using factors of 32, then there are 6 +# possible values for (length of outer axis, length of inner axis) pair, +# namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6 +# possible values of `tile_y`. 
+# +# During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. +# We stores the lengths of outer axes and inner axes in +# :code:`cfg['tile_y'].size` (a tuple with two elements). In this template, +# we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. +# Actually, this is equivalent to :code:`yo, yi = s[C].split(y, +# cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y, +# nparts=cfg['tile_y"].size[0])` +# +# The advantage of using cfg.apply API is that it makes multi-level splits +# (that is, when num_outputs >= 3) easier. + +################################################################################ +# Step 2: Use AutoTVM to Optimize the Matrix Multiplication +# --------------------------------------------------------- +# In Step 1, we wrote a matrix multiplication template that allowed us to +# paramaterize the block size used in the `split` schedule. We can now conduct +# a search over this parameter space. The next step is to pick a tuner to guide +# the exploration of this space. +# +# Auto-tuners in TVM +# ~~~~~~~~~~~~~~~~~~ +# The job for a tuner can be described by following pseudo code +# +# .. code-block:: c +# +# ct = 0 +# while ct < max_number_of_trials: +# propose a batch of configs +# measure this batch of configs on real hardware and get results +# ct += batch_size +# +# When proposing the next batch of configs, the tuner can take different +# strategies. Some of the tuner strategies provided by TVM include: +# +# * :any:`tvm.autotvm.tuner.RandomTuner`: Enumerate the space in a random order +# * :any:`tvm.autotvm.tuner.GridSearchTuner`: Enumerate the space in a grid search order +# * :any:`tvm.autotvm.tuner.GATuner`: Using genetic algorithm to search through the space +# * :any:`tvm.autotvm.tuner.XGBTuner`: Uses a model based method. Train a XGBoost model to +# predict the speed of lowered IR and pick the next batch according to the +# prediction. +# +# You can choose the tuner according to the size of your space, your time +# budget and other factors. For example, if your space is very small (less +# than 1000), a gridsearch tuner or a random tuner is good enough. If your +# space is at the level of 10^9 (this is the space size of a conv2d operator on +# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. + +################################################################################ +# Begin tuning +# ~~~~~~~~~~~~ +# Here we continue our matrix multiplication example. First we create a tuning +# task. We can also inspect the initialized search space. In this case, for a +# 512x512 square matrix multiplication, the space size is 10x10=100 Note that +# the task and search space are independent of the tuner picked. + +N, L, M = 512, 512, 512 +task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") +print(task.config_space) + +################################################################################ +# Then we need to define how to measure the generated code and pick a tuner. +# Since our space is small, a random tuner is just okay. +# +# We only make 10 trials in this tutorial for demonstration. In practice, you +# can do more trials according to your time budget. We will log the tuning +# results into a log file. This file can be used to choose the best +# configuration discovered by the tuner later. 
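################################################################################
# .. note:: Inspecting the tuning log
#
#   A hedged aside (a sketch assuming the ``autotvm.record`` helpers): once the
#   tuning run below has written ``matmul.log``, the raw measurement records
#   can be loaded and inspected directly, for example:
#
#   .. code-block:: python
#
#     from tvm.autotvm.record import load_from_file
#
#     for inp, res in load_from_file("matmul.log"):
#         print(inp.config, res.costs)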
+ +# logging config (for printing tuning log to the screen) +logging.getLogger("autotvm").setLevel(logging.DEBUG) +logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) + +################################################################################ +# There are two steps for measuring a config: build and run. By default, we use +# all CPU cores to compile program. We then measure them sequentially. To help +# reduce variance, we take 5 measurements and average them. +measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) + +# Begin tuning with RandomTuner, log records to file `matmul.log` +# You can use alternatives like XGBTuner. +tuner = autotvm.tuner.RandomTuner(task) +tuner.tune( + n_trial=10, + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file("matmul.log")], +) + +################################################################################ +# With tuning completed, we can choose the configuration from the log file that +# has the best measured performance and compile the schedule with the +# corresponding parameters. We also do a quick verfication that the schedule is +# producing correct answers. We can call the function :code:`matmul` directly +# under the :any:`autotvm.apply_history_best` context. When we call this +# function, it will query the dispatch context with its argument and get the +# best config with the same argument. + +# apply history best from log file +with autotvm.apply_history_best("matmul.log"): + with tvm.target.Target("llvm"): + s, arg_bufs = matmul(N, L, M, "float32") + func = tvm.build(s, arg_bufs) + +# check correctness +a_np = np.random.uniform(size=(N, L)).astype(np.float32) +b_np = np.random.uniform(size=(L, M)).astype(np.float32) +c_np = a_np.dot(b_np) + +c_tvm = tvm.nd.empty(c_np.shape) +func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) + +tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to build operator templates that allow +# TVM to search a parameter space and choose optimized schedule configurations. +# To gain a deeper understanding of how this works, we recommend expanding on +# this example by adding new search parameters to the schedule based on +# schedule operations demonstated in the `Getting Started With Tensor +# Expressions _` tutorial. In the upcoming sections, we +# will demonstate the AutoScheduler, a method for TVM to optimize common +# operators without the need for the user to provide a user-defined template. diff --git a/gallery/user_tutorials/autotvm_relay_x86.py b/gallery/user_tutorials/autotvm_relay_x86.py new file mode 100644 index 000000000000..67faec4505a6 --- /dev/null +++ b/gallery/user_tutorials/autotvm_relay_x86.py @@ -0,0 +1,476 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Compiling and Optimizing a Model with the Python Interface (AutoTVM) +==================================================================== +**Author**: +`Chris Hoge `_ + +In the `TVMC Tutorial `_, we covered how to compile, run, and tune a +pre-trained vision model, ResNet-50-v2 using the command line interface for +TVM, TVMC. TVM is more that just a command-line tool though, it is an +optimizing framework with APIs available for a number of different languages +that gives you tremendous flexibility in working with machine learning models. + +In this tutorial we will cover the same ground we did with TVMC, but show how +it is done with the Python API. Upon completion of this section, we will have +used the Python API for TVM to accomplish the following tasks: + +* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. +* Run a real image through the compiled model, and interpret the output and model + performance. +* Tune the model that model on a CPU using TVM. +* Re-compile an optimized model using the tuning data collected by TVM. +* Run the image through the optimized model, and compare the output and model + performance. + +The goal of this section is to give you an overview of TVM's capabilites and +how to use them through the Python API. +""" + +################################################################################ +# TVM is a deep learning compiler framework, with a number of different modules +# available for working with deep learning models and operators. In this +# tutorial we will work through how to load, compile, and optimize a model +# using the Python API. +# +# We begin by importing a number of dependencies, including ``onnx`` for +# loading and converting the model, helper utilities for downloading test data, +# the Python Image Library for working with the image data, ``numpy`` for pre +# and post-processing of the image data, the TVM Relay framework, and the TVM +# Graph Executor. + +import onnx +from tvm.contrib.download import download_testdata +from PIL import Image +import numpy as np +import tvm.relay as relay +import tvm +from tvm.contrib import graph_executor + +################################################################################ +# Downloading and Loading the ONNX Model +# -------------------------------------- +# +# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a +# convolutional neural network that is 50-layers deep and designed to classify +# images. The model we will be using has been pre-trained on more than a +# million images with 1000 different classifications. The network has an input +# image size of 224x224. If you are interested exploring more of how the +# ResNet-50 model is structured, we recommend downloading +# `Netron `_, a freely available ML model viewer. +# +# TVM provides a helper library to download pre-trained models. By providing a +# model URL, file name, and model type through the module, TVM will download +# the model and save it to disk. For the instance of an ONNX model, you can +# then load it into memory using the ONNX runtime. +# +# .. 
note:: Working with Other Model Formats +# +# TVM supports many popular model formats. A list can be found in the `Compile +# Deep Learning Models +# `_ +# section of the TVM Documentation. + +model_url = "".join( + [ + "https://github.com/onnx/models/raw/", + "master/vision/classification/resnet/model/", + "resnet50-v2-7.onnx", + ] +) + +model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx") +onnx_model = onnx.load(model_path) + +################################################################################ +# Downloading, Preprocessing, and Loading the Test Image +# ------------------------------------------------------ +# +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. +# +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. +# +# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg +# :height: 224px +# :width: 224px +# :align: center +# +# Download the image data, then convert it to a numpy array to use as an input to the model. + +img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +img_path = download_testdata(img_url, "imagenet_cat.png", module="data") + +# Resize it to 224x224 +resized_image = Image.open(img_path).resize((224, 224)) +img_data = np.asarray(resized_image).astype("float32") + +# Our input image is in HWC layout while ONNX expects CHW input, so convert the array +img_data = np.transpose(img_data, (2, 0, 1)) + +# Normalize according to the ImageNet input specification +imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) +norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev + +# Add the batch dimension, as we are expecting 4-dimensional input: NCHW. +img_data = np.expand_dims(norm_img_data, axis=0) + +############################################################################### +# Compile the Model With Relay +# ---------------------------- +# +# The next step is to compile the ResNet model. We begin by importing the model +# to relay using the `from_onnx` importer. We then build the model, with +# standard optimizations, into a TVM library. Finally, we create a TVM graph +# runtime module from the library. + +target = "llvm" + +###################################################################### +# .. note:: Defining the Correct Target +# +# Specifying the correct target can have a huge impact on the performance of +# the compiled module, as it can take advantage of hardware features +# available on the target. For more information, please refer to `Auto-tuning +# a convolutional network for x86 CPU +# `_. +# We recommend identifying which CPU you are running, along with optional +# features, and set the target appropriately. For example, for some +# processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm +# -mcpu=skylake-avx512"`` for processors with the AVX-512 vector instruction +# set. +# + +# The input name may vary across model types. 
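# A hedged aside (not in the original tutorial): the graph input names can also
# be listed programmatically from the ONNX model loaded above, e.g.
#
# .. code-block:: python
#
#   print([i.name for i in onnx_model.graph.input])
#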
You can use a tool +# like netron to check input names +input_name = "data" +shape_dict = {input_name: img_data.shape} + +mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) + +with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) + +###################################################################### +# Execute on the TVM Runtime +# -------------------------- +# Now that we've compiled the model, we can use the TVM runtime to make +# predictions with it. To use TVM to run the model and make predictions, we +# need two things: +# +# - The compiled model, which we just produced. +# - Valid input to the model to make predictions on. + +dtype = "float32" +module.set_input(input_name, img_data) +module.run() +output_shape = (1, 1000) +tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() + +################################################################################ +# Collect Basic Performance Data +# ------------------------------ +# We want to collect some basic performance data associated with this +# unoptimized model and compare it to a tuned model later. To help account for +# CPU noise, we run the computation in multiple batches in multiple +# repetitions, then gather some basis statistics on the mean, median, and +# standard deviation. +import timeit + +timing_number = 10 +timing_repeat = 10 +unoptimized = ( + np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) + * 1000 + / timing_number +) +unoptimized = { + "mean": np.mean(unoptimized), + "median": np.median(unoptimized), + "std": np.std(unoptimized), +} + +print(unoptimized) + +################################################################################ +# Postprocess the output +# ---------------------- +# +# As previously mentioned, each model will have its own particular way of +# providing output tensors. +# +# In our case, we need to run some post-processing to render the outputs from +# ResNet-50-V2 into a more human-readable form, using the lookup-table provided +# for the model. + +from scipy.special import softmax + +# Download a list of labels +labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +labels_path = download_testdata(labels_url, "synset.txt", module="data") + +with open(labels_path, "r") as f: + labels = [l.rstrip() for l in f] + +# Open the output and read the output tensor +scores = softmax(tvm_output) +scores = np.squeeze(scores) +ranks = np.argsort(scores)[::-1] +for rank in ranks[0:5]: + print("class='%s' with probability=%f" % (labels[rank], scores[rank])) + +################################################################################ +# This should produce the following output: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Tune the model +# -------------- +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVM to target your working platform. 
+# +# In some cases, we might not get the expected performance when running +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner, to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. +# The results of these runs are stored in a tuning records file. +# +# In the simplest form, tuning requires you to provide three things: +# +# - the target specification of the device you intend to run this model on +# - the path to an output file in which the tuning records will be stored +# - a path to the model to be tuned. +# + +import tvm.auto_scheduler as auto_scheduler +from tvm.autotvm.tuner import XGBTuner +from tvm import autotvm + +# Set up some basic parameters for the runner. The runner takes compiled code +# that is generated with a specific set of parameters and measures the +# performance of it. ``number`` specifies the number of different +# configurations that we will test, while ``repeat`` specifies how many +# measurements we will take of each configuration. ``min_repeat_ms`` is a value +# that specifies how long need to run configuration test. If the number of +# repeats falls under this time, it will be increased. This option is necessary +# for accurate tuning on GPUs, and is not required for CPU tuning. Setting this +# value to 0 disables it. The ``timeout`` places an upper limit on how long to +# run training code for each tested configuration. + +number = 10 +repeat = 1 +min_repeat_ms = 0 # since we're tuning on a CPU, can be set to 0 +timeout = 10 # in seconds + +# create a TVM runner +runner = autotvm.LocalRunner( + number=number, + repeat=repeat, + timeout=timeout, + min_repeat_ms=min_repeat_ms, + enable_cpu_cache_flush=True, +) + +# Create a simple structure for holding tuning options. We use an XGBoost +# algorithim for guiding the search. For a production job, you will want to set +# the number of trials to be larger than the value of 10 used here. For CPU we +# recommend 1500, for GPU 3000-4000. The number of trials required can depend +# on the particular model and processor, so it's worth spending some time +# evaluating performance across a range of values to find the best balance +# between tuning time and model optimization. Because running tuning is time +# intensive we set number of trials to 10, but do not recommend a value this +# small. The ``early_stopping`` parameter is the minimum number of trails to +# run before a condition that stops the search early can be applied. The +# measure option indicates where trial code will be built, and where it will be +# run. In this case, we're using the ``LocalRunner`` we just created and a +# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write +# the tuning data to. + +tuning_option = { + "tuner": "xgb", + "trials": 10, + "early_stopping": 100, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func="default"), runner=runner + ), + "tuning_records": "resnet-50-v2-autotuning.json", +} + +################################################################################ +# .. 
note:: Defining the Tuning Search Algorithm +# +# By default this search is guided using an `XGBoost Grid` algorithm. +# Depending on your model complexity and amount of time available, you might +# want to choose a different algorithm. + + +################################################################################ +# .. note:: Setting Tuning Parameters +# +# In this example, in the interest of time, we set the number of trials and +# early stopping to 10. You will likely see more performance improvements if +# you set these values to be higher but this comes at the expense of time +# spent tuning. The number of trials required for convergence will vary +# depending on the specifics of the model and the target platform. + +# begin by extracting the taks from the onnx model +tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params) + +# Tune the extracted tasks sequentially. +for i, task in enumerate(tasks): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + tuner_obj = XGBTuner(task, loss_type="rank") + tuner_obj.tune( + n_trial=min(tuning_option["trials"], len(task.config_space)), + early_stopping=tuning_option["early_stopping"], + measure_option=tuning_option["measure_option"], + callbacks=[ + autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix), + autotvm.callback.log_to_file(tuning_option["tuning_records"]), + ], + ) + +################################################################################ +# The output from this tuning process will look something like this: +# +# .. code-block:: bash +# +# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done. +# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done. +# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done. +# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done. +# # [Task 4/24] Current/Best: 11.96/ 156.49 GFLOPS | Progress: (960/1000) | 632.26 s Done. +# # [Task 5/24] Current/Best: 23.75/ 130.78 GFLOPS | Progress: (800/1000) | 739.29 s Done. +# # [Task 6/24] Current/Best: 38.29/ 198.31 GFLOPS | Progress: (1000/1000) | 624.51 s Done. +# # [Task 7/24] Current/Best: 4.31/ 210.78 GFLOPS | Progress: (1000/1000) | 701.03 s Done. +# # [Task 8/24] Current/Best: 50.25/ 185.35 GFLOPS | Progress: (972/1000) | 538.55 s Done. +# # [Task 9/24] Current/Best: 50.19/ 194.42 GFLOPS | Progress: (1000/1000) | 487.30 s Done. +# # [Task 10/24] Current/Best: 12.90/ 172.60 GFLOPS | Progress: (972/1000) | 607.32 s Done. +# # [Task 11/24] Current/Best: 62.71/ 203.46 GFLOPS | Progress: (1000/1000) | 581.92 s Done. +# # [Task 12/24] Current/Best: 36.79/ 224.71 GFLOPS | Progress: (1000/1000) | 675.13 s Done. +# # [Task 13/24] Current/Best: 7.76/ 219.72 GFLOPS | Progress: (1000/1000) | 519.06 s Done. +# # [Task 14/24] Current/Best: 12.26/ 202.42 GFLOPS | Progress: (1000/1000) | 514.30 s Done. +# # [Task 15/24] Current/Best: 31.59/ 197.61 GFLOPS | Progress: (1000/1000) | 558.54 s Done. +# # [Task 16/24] Current/Best: 31.63/ 206.08 GFLOPS | Progress: (1000/1000) | 708.36 s Done. +# # [Task 17/24] Current/Best: 41.18/ 204.45 GFLOPS | Progress: (1000/1000) | 736.08 s Done. +# # [Task 18/24] Current/Best: 15.85/ 222.38 GFLOPS | Progress: (980/1000) | 516.73 s Done. +# # [Task 19/24] Current/Best: 15.78/ 203.41 GFLOPS | Progress: (1000/1000) | 587.13 s Done. +# # [Task 20/24] Current/Best: 30.47/ 205.92 GFLOPS | Progress: (980/1000) | 471.00 s Done. 
+# # [Task 21/24] Current/Best: 46.91/ 227.99 GFLOPS | Progress: (308/1000) | 219.18 s Done. +# # [Task 22/24] Current/Best: 13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done. +# # [Task 23/24] Current/Best: 53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done. +# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done. + +################################################################################ +# Compiling an Optimized Model with Tuning Data +# ---------------------------------------------- +# +# As an output of the tuning process above, we obtained the tuning records +# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to +# generate high performance code for the model on your specified target. +# +# Now that tuning data for the model has been collected, we can re-compile the +# model using optimized operators to speed up our computations. + +with autotvm.apply_history_best(tuning_option["tuning_records"]): + with tvm.transform.PassContext(opt_level=3, config={}): + lib = relay.build(mod, target=target, params=params) + +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) + +################################################################################ +# Verify that the optimized model runs and produces the same results: + +dtype = "float32" +module.set_input(input_name, img_data) +module.run() +output_shape = (1, 1000) +tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() + +scores = softmax(tvm_output) +scores = np.squeeze(scores) +ranks = np.argsort(scores)[::-1] +for rank in ranks[0:5]: + print("class='%s' with probability=%f" % (labels[rank], scores[rank])) + +# Verifying that the predictions are the same: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610550 +# # class='n02123159 tiger cat' with probability=0.367181 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Comparing the Tuned and Untuned Models +# -------------------------------------- +# We want to collect some basic performance data associated with this optimized +# model to compare it to the unoptimized model. Depending on your underlying +# hardware, number of iterations, and other factors, you should see a performance +# improvement in comparing the optimized model to the unoptimized model. + +import timeit + +timing_number = 10 +timing_repeat = 10 +optimized = ( + np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) + * 1000 + / timing_number +) +optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)} + + +print("optimized: %s" % (optimized)) +print("unoptimized: %s" % (unoptimized)) + +################################################################################ +# Final Remarks +# ------------- +# +# In this tutorial, we gave a short example of how to use the TVM Python API +# to compile, run, and tune a model. We also discussed the need for pre and +# post-processing of inputs and outputs. After the tuning process, we +# demonstrated how to compare the performance of the unoptimized and optimize +# models. +# +# Here we presented a simple example using ResNet 50 V2 locally. 
However, TVM +# supports many more features including cross-compilation, remote execution and +# profiling/benchmarking. diff --git a/gallery/user_tutorials/cross_compilation_and_rpc.py b/gallery/user_tutorials/cross_compilation_and_rpc.py new file mode 100644 index 000000000000..25208369f74d --- /dev/null +++ b/gallery/user_tutorials/cross_compilation_and_rpc.py @@ -0,0 +1,265 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-cross-compilation-and-rpc: + +Cross Compilation and RPC +========================= +**Author**: `Ziheng Jiang `_, `Lianmin Zheng `_ + +This tutorial introduces cross compilation and remote device +execution with RPC in TVM. + +With cross compilation and RPC, you can **compile a program on your +local machine then run it on the remote device**. It is useful when +the remote device resource are limited, like Raspberry Pi and mobile +platforms. In this tutorial, we will use the Raspberry Pi for a CPU example +and the Firefly-RK3399 for an OpenCL example. +""" + +###################################################################### +# Build TVM Runtime on Device +# --------------------------- +# +# The first step is to build the TVM runtime on the remote device. +# +# .. note:: +# +# All instructions in both this section and the next section should be +# executed on the target device, e.g. Raspberry Pi. We assume the target +# is running Linux. +# +# Since we do compilation on the local machine, the remote device is only used +# for running the generated code. We only need to build the TVM runtime on +# the remote device. +# +# .. code-block:: bash +# +# git clone --recursive https://github.com/apache/tvm tvm +# cd tvm +# make runtime -j2 +# +# After building the runtime successfully, we need to set environment variables +# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` +# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM +# directory is in :code:`~/tvm`): +# +# .. code-block:: bash +# +# export PYTHONPATH=$PYTHONPATH:~/tvm/python +# +# To update the environment variables, execute :code:`source ~/.bashrc`. + +###################################################################### +# Set Up RPC Server on Device +# --------------------------- +# To start an RPC server, run the following command on your remote device +# (Which is Raspberry Pi in this example). +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 +# +# If you see the line below, it means the RPC server started +# successfully on your device. +# +# .. 
code-block:: bash +# +# INFO:root:RPCServer: bind to 0.0.0.0:9090 +# + +###################################################################### +# Declare and Cross Compile Kernel on Local Machine +# ------------------------------------------------- +# +# .. note:: +# +# Now we go back to the local machine, which has a full TVM installed +# (with LLVM). +# +# Here we will declare a simple kernel on the local machine: + +import numpy as np + +import tvm +from tvm import te +from tvm import rpc +from tvm.contrib import utils + +n = tvm.runtime.convert(1024) +A = te.placeholder((n,), name="A") +B = te.compute((n,), lambda i: A[i] + 1.0, name="B") +s = te.create_schedule(B.op) + +###################################################################### +# Then we cross compile the kernel. +# The target should be 'llvm -mtriple=armv7l-linux-gnueabihf' for +# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable +# on our webpage building server. See the detailed note in the following block. + +local_demo = True + +if local_demo: + target = "llvm" +else: + target = "llvm -mtriple=armv7l-linux-gnueabihf" + +func = tvm.build(s, [A, B], target=target, name="add_one") +# save the lib at a local temp folder +temp = utils.tempdir() +path = temp.relpath("lib.tar") +func.export_library(path) + +###################################################################### +# .. note:: +# +# To run this tutorial with a real remote device, change :code:`local_demo` +# to False and replace :code:`target` in :code:`build` with the appropriate +# target triple for your device. The target triple which might be +# different for different devices. For example, it is +# :code:`'llvm -mtriple=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and +# :code:`'llvm -mtriple=aarch64-linux-gnu'` for RK3399. +# +# Usually, you can query the target by running :code:`gcc -v` on your +# device, and looking for the line starting with :code:`Target:` +# (Though it may still be a loose configuration.) +# +# Besides :code:`-mtriple`, you can also set other compilation options +# like: +# +# * -mcpu= +# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture. +# * -mattr=a1,+a2,-a3,... +# Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU. +# To get the list of available attributes, you can do: +# +# .. code-block:: bash +# +# llc -mtriple= -mattr=help +# +# These options are consistent with `llc `_. +# It is recommended to set target triple and feature set to contain specific +# feature available, so we can take full advantage of the features of the +# board. +# You can find more details about cross compilation attributes from +# `LLVM guide of cross compilation `_. + +###################################################################### +# Run CPU Kernel Remotely by RPC +# ------------------------------ +# We show how to run the generated CPU kernel on the remote device. +# First we obtain an RPC session from remote device. + +if local_demo: + remote = rpc.LocalSession() +else: + # The following is my environment, change this to the IP address of your target device + host = "10.77.1.162" + port = 9090 + remote = rpc.connect(host, port) + +###################################################################### +# Upload the lib to the remote device, then invoke a device local +# compiler to relink them. 
Now `func` is a remote module object. + +remote.upload(path) +func = remote.load_module("lib.tar") + +# create arrays on the remote device +dev = remote.cpu() +a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) +b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) +# the function will run on the remote device +func(a, b) +np.testing.assert_equal(b.numpy(), a.numpy() + 1) + +###################################################################### +# When you want to evaluate the performance of the kernel on the remote +# device, it is important to avoid the overhead of network. +# :code:`time_evaluator` will returns a remote function that runs the +# function over number times, measures the cost per run on the remote +# device and returns the measured cost. Network overhead is excluded. + +time_f = func.time_evaluator(func.entry_name, dev, number=10) +cost = time_f(a, b).mean +print("%g secs/op" % cost) + +######################################################################### +# Run OpenCL Kernel Remotely by RPC +# --------------------------------- +# For remote OpenCL devices, the workflow is almost the same as above. +# You can define the kernel, upload files, and run via RPC. +# +# .. note:: +# +# Raspberry Pi does not support OpenCL, the following code is tested on +# Firefly-RK3399. You may follow this `tutorial `_ +# to setup the OS and OpenCL driver for RK3399. +# +# Also we need to build the runtime with OpenCL enabled on rk3399 board. In the TVM +# root directory, execute +# +# .. code-block:: bash +# +# cp cmake/config.cmake . +# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake +# make runtime -j4 +# +# The following function shows how we run an OpenCL kernel remotely + + +def run_opencl(): + # NOTE: This is the setting for my rk3399 board. You need to modify + # them according to your environment. + opencl_device_host = "10.77.1.145" + opencl_device_port = 9090 + target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") + + # create schedule for the above "add one" compute declaration + s = te.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=32) + s[B].bind(xo, te.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) + func = tvm.build(s, [A, B], target=target) + + remote = rpc.connect(opencl_device_host, opencl_device_port) + + # export and upload + path = temp.relpath("lib_cl.tar") + func.export_library(path) + remote.upload(path) + func = remote.load_module("lib_cl.tar") + + # run + dev = remote.cl() + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + func(a, b) + np.testing.assert_equal(b.numpy(), a.numpy() + 1) + print("OpenCL test passed!") + + +###################################################################### +# Summary +# ------- +# This tutorial provides a walk through of cross compilation and RPC +# features in TVM. +# +# - Set up an RPC server on the remote device. +# - Set up the target device configuration to cross compile the kernels on the +# local machine. +# - Upload and run the kernels remotely via the RPC API. diff --git a/gallery/user_tutorials/install.py b/gallery/user_tutorials/install.py new file mode 100644 index 000000000000..b69b8b493a4f --- /dev/null +++ b/gallery/user_tutorials/install.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Installing TVM +============== +**Authors**: +`Jocelyn Shiue `_, +`Chris Hoge `_ + +Depending on your needs and your working environment, there are a few different +methods for installing TVM. These include: + +* Installing from source +* Installing from third-party binary package. +""" + +################################################################################ +# Installing from Source +# ---------------------- +# Installing from source is the recommended method for installing TVM. It will +# allow you to enable specific features such as GPU support, microcontroller +# support (microTVM), and a debugging runtime, and other features. You will also +# want to install from source if you want to actively contribute to the TVM +# project. The full instructions are on the `Install TVM From Source +# `_ page. + +################################################################################ +# Installing From Binary Packages +# -------------------------------- +# You may install convenient third-party binary package distributions to +# quickly try things out. TLCPack is a thirdparty volunteer community that +# builds binary packages from TVM source. It offers support matrix with +# instructions to install on different platforms, with different features. +# Checkout `TLCPack `_ to learn more. Note that the +# thirdparty binary packages could contain additional licensing terms for +# the hardware drivers that are bundled with it. diff --git a/gallery/user_tutorials/introduction.py b/gallery/user_tutorials/introduction.py new file mode 100644 index 000000000000..0746c3983b61 --- /dev/null +++ b/gallery/user_tutorials/introduction.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Introduction +============ +**Authors**: +`Jocelyn Shiue `_, +`Chris Hoge `_, +`Lianmin Zheng `_ + +Apache TVM is an open source machine learning compiler framework for CPUs, +GPUs, and machine learning accelerators. It aims to enable machine learning +engineers to optimize and run computations efficiently on any hardware backend. 
+The purpose of this tutorial is to take a guided tour through all of the major +features of TVM by defining and demonstrating key concepts. A new user should +be able to work through the tutorial from start to finish and be able to +operate TVM for automatic model optimization, while having a basic +understanding of the TVM architecture and how it works. + +Contents +-------- + +#. :doc:`Introduction ` +#. :doc:`Installing TVM ` +#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` +#. :doc:`Compiling and Optimizing a Model with the Python Interface ` +#. :doc:`Working with Operators Using Tensor Expression ` +#. :doc:`Optimizing Operators with Templates and AutoTVM ` +#. :doc:`Optimizing Operators with Template-free AutoScheduler ` +#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) ` +#. :doc:`Compiling Deep Learning Models for GPUs ` +""" + +################################################################################ +# An Overview of TVM and Model Optimization +# ========================================= +# +# The diagram below illustrates the steps a machine model takes as it is +# transformed with the TVM optimizing compiler framework. +# +# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png +# :width: 100% +# :alt: A High Level View of TVM +# +# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*. +# The importer layer is where TVM can ingest models from other frameworks, like +# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each +# frontend varies as we are constantly improving the open source project. If +# you're having issues importing your model into TVM, you may want to try +# converting it to ONNX. +# +# 2. Translate to *Relay*, TVM's high-level model language. +# A model that has been imported into TVM is represented in Relay. Relay is a +# functional language and intermediate representation (IR) for neural networks. +# It has support for: +# +# - Traditional data flow-style representations +# - Functional-style scoping, let-binding which makes it a fully featured +# differentiable language +# - Ability to allow the user to mix the two programming styles +# +# Relay applies graph-level optimization passes to optimize the model. +# +# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a +# higher-level representation is transformed into a lower-level +# representation. After applying the high-level optimizations, Relay +# runs FuseOps pass to partition the model into many small subgraphs and lowers +# the subgraphs to TE representation. Tensor Expression (TE) is a +# domain-specific language for describing tensor computations. +# TE also provides several *schedule* primitives to specify low-level loop +# optimizations, such as tiling, vectorization, parallelization, +# unrolling, and fusion. +# To aid in the process of converting Relay representation into TE representation, +# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined +# templates of common tensor operators (e.g., conv2d, transpose). +# +# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. +# A schedule specifies the low-level loop optimizations for an operator or +# subgraph defined in TE. Auto-tuning modules search for the best schedule +# and compare them with cost models and on-device measurements. +# There are two auto-tuning modules in TVM. +# +# - **AutoTVM**: A template-based auto-tuning module. 
It runs search algorithms +# to find the best values for the tunable knobs in a user-defined template. +# For common operators, their templates are already provided in TOPI. +# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. +# It does not require pre-defined schedule templates. Instead, it generates +# the search space automatically by analyzing the computation definition. +# It then searches for the best schedule in the generated search space. +# +# 5. Choose the optimal configurations for model compilation. After tuning, the +# auto-tuning module generates tuning records in JSON format. This step +# picks the best schedule for each subgraph. +# +# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level +# intermediate representation. After selecting the optimal configurations +# based on the tuning step, each TE subgraph is lowered to TIR and be +# optimized by low-level optimization passes. Next, the optimized TIR is +# lowered to the target compiler of the hardware platform. +# This is the final code generation phase to produce an optimized model +# that can be deployed into production. TVM supports several different +# compiler backends including: +# +# - LLVM, which can target arbitrary microprocessor architecture including +# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any +# other platform supported by LLVM. +# - Specialized compilers, such as NVCC, NVIDIA's compiler. +# - Embedded and specialized targets, which are implemented through TVM's +# Bring Your Own Codegen (BYOC) framework. +# +# 7. Compile down to machine code. At the end of this process, the +# compiler-specific generated code can be lowered to machine code. +# +# TVM can compile models down to a linkable object module, which can then be +# run with a lightweight TVM runtime that provides C APIs to dynamically +# load the model, and entry points for other languages such as Python and +# Rust. TVM can also build a bundled deployment in which the runtime is +# combined with the model in a single package. +# +# The remainder of the tutorial will cover these aspects of TVM in more detail. diff --git a/gallery/user_tutorials/relay_quick_start.py b/gallery/user_tutorials/relay_quick_start.py new file mode 100644 index 000000000000..fd7f5aa9d756 --- /dev/null +++ b/gallery/user_tutorials/relay_quick_start.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-relay-quick-start: + +Quick Start Tutorial for Compiling Deep Learning Models +======================================================= +**Author**: `Yao Wang `_, `Truman Tian `_ + +This example shows how to build a neural network with Relay python frontend and +generates a runtime library for Nvidia GPU with TVM. 
+Notice that you need to build TVM with cuda and llvm enabled. +""" + +###################################################################### +# Overview for Supported Hardware Backend of TVM +# ---------------------------------------------- +# The image below shows hardware backend currently supported by TVM: +# +# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png +# :align: center +# +# In this tutorial, we'll choose cuda and llvm as target backends. +# To begin with, let's import Relay and TVM. + +import numpy as np + +from tvm import relay +from tvm.relay import testing +import tvm +from tvm import te +from tvm.contrib import graph_executor +import tvm.testing + +###################################################################### +# Define Neural Network in Relay +# ------------------------------ +# First, let's define a neural network with relay python frontend. +# For simplicity, we'll use pre-defined resnet-18 network in Relay. +# Parameters are initialized with Xavier initializer. +# Relay also supports other model formats such as MXNet, CoreML, ONNX and +# Tensorflow. +# +# In this tutorial, we assume we will do inference on our device and +# the batch size is set to be 1. Input images are RGB color images of +# size 224 * 224. We can call the +# :py:meth:`tvm.relay.expr.TupleWrapper.astext()` to show the network +# structure. + +batch_size = 1 +num_class = 1000 +image_shape = (3, 224, 224) +data_shape = (batch_size,) + image_shape +out_shape = (batch_size, num_class) + +mod, params = relay.testing.resnet.get_workload( + num_layers=18, batch_size=batch_size, image_shape=image_shape +) + +# set show_meta_data=True if you want to show meta data +print(mod.astext(show_meta_data=False)) + +###################################################################### +# Compilation +# ----------- +# Next step is to compile the model using the Relay/TVM pipeline. +# Users can specify the optimization level of the compilation. +# Currently this value can be 0 to 3. The optimization passes include +# operator fusion, pre-computation, layout transformation and so on. +# +# :py:func:`relay.build` returns three components: the execution graph in +# json format, the TVM module library of compiled functions specifically +# for this graph on the target hardware, and the parameter blobs of +# the model. During the compilation, Relay does the graph-level +# optimization while TVM does the tensor-level optimization, resulting +# in an optimized runtime module for model serving. +# +# We'll first compile for Nvidia GPU. Behind the scene, :py:func:`relay.build` +# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., +# then registers the operators (i.e. the nodes of the optimized graphs) to +# TVM implementations to generate a `tvm.module`. +# To generate the module library, TVM will first transfer the high level IR +# into the lower intrinsic IR of the specified target backend, which is CUDA +# in this example. Then the machine code will be generated as the module library. + +opt_level = 3 +target = tvm.target.cuda() +with tvm.transform.PassContext(opt_level=opt_level): + lib = relay.build(mod, target, params=params) + +##################################################################### +# Run the generate library +# ------------------------ +# Now we can create graph executor and run the module on Nvidia GPU. 
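# Before running, you can optionally confirm that your TVM build has CUDA
# enabled and that a GPU is actually visible. This is a small convenience
# sketch added here, not part of the original flow; ``tvm.cuda(0).exist`` is
# False when no CUDA device can be found.
if not tvm.cuda(0).exist:
    print("No CUDA device detected: the steps below need a CUDA-enabled TVM build and a GPU.")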
+ +# create random input +dev = tvm.cuda() +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +# create module +module = graph_executor.GraphModule(lib["default"](dev)) +# set input and parameters +module.set_input("data", data) +# run +module.run() +# get output +out = module.get_output(0, tvm.nd.empty(out_shape)).numpy() + +# Print first 10 elements of output +print(out.flatten()[0:10]) + +###################################################################### +# Save and Load Compiled Module +# ----------------------------- +# We can also save the graph, lib and parameters into files and load them +# back in deploy environment. + +#################################################### + +# save the graph, lib and params into separate files +from tvm.contrib import utils + +temp = utils.tempdir() +path_lib = temp.relpath("deploy_lib.tar") +lib.export_library(path_lib) +print(temp.listdir()) + +#################################################### + +# load the module back. +loaded_lib = tvm.runtime.load_module(path_lib) +input_data = tvm.nd.array(data) + +module = graph_executor.GraphModule(loaded_lib["default"](dev)) +module.run(data=input_data) +out_deploy = module.get_output(0).numpy() + +# Print first 10 elements of output +print(out_deploy.flatten()[0:10]) + +# check whether the output from deployed module is consistent with original one +tvm.testing.assert_allclose(out_deploy, out, atol=1e-5) diff --git a/gallery/user_tutorials/tensor_expr_get_started.py b/gallery/user_tutorials/tensor_expr_get_started.py new file mode 100644 index 000000000000..310d6bdbfee4 --- /dev/null +++ b/gallery/user_tutorials/tensor_expr_get_started.py @@ -0,0 +1,903 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-tensor-expr-get-started: + +Working with Operators Using Tensor Expression +============================================== +**Author**: `Tianqi Chen `_ + +In this tutorial we will turn our attention to how TVM works with Tensor +Expression (TE) to define tensor computations and apply loop optimizations. TE +describes tensor computations in a pure functional language (that is each +expression has no side effects). When viewed in context of the TVM as a whole, +Relay describes a computation as a set of operators, and each of these +operators can be represented as a TE expression where each TE expression takes +input tensors and produces an output tensor. + +This is an introductory tutorial to the Tensor Expression language in TVM. TVM +uses a domain specific tensor expression for efficient kernel construction. We +will demonstrate the basic workflow with two examples of using the tensor expression +language. The first example introduces TE and scheduling with vector +addition. 
The second expands on these concepts with a step-by-step optimization +of a matrix multiplication with TE. This matrix multiplication example will +serve as the comparative basis for future tutorials covering more advanced +features of TVM. +""" + +################################################################################ +# Example 1: Writing and Scheduling Vector Addition in TE for CPU +# --------------------------------------------------------------- +# +# Let's look at an example in Python in which we will implement a TE for +# vector addition, followed by a schedule targeted towards a CPU. +# We begin by initializing a TVM environment. + +import tvm +import tvm.testing +from tvm import te +import numpy as np + +# You will get better performance if you can identify the CPU you are targeting +# and specify it. If you're using llvm, you can get this information from the +# command ``llc --version`` to get the CPU type, and you can check +# ``/proc/cpuinfo`` for additional extensions that your processor might +# support. For example, you can use "llvm -mcpu=skylake-avx512" for CPUs with +# AVX-512 instructions. + +tgt = tvm.target.Target(target="llvm", host="llvm") + +################################################################################ +# Describing the Vector Computation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# We describe a vector addition computation. TVM adopts tensor semantics, with +# each intermediate result represented as a multi-dimensional array. The user +# needs to describe the computation rule that generates the tensors. We first +# define a symbolic variable ``n`` to represent the shape. We then define two +# placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then +# describe the result tensor ``C``, with a ``compute`` operation. The +# ``compute`` defines a computation, with the output conforming to the +# specified tensor shape and the computation to be performed at each position +# in the tensor defined by the lambda function. Note that while ``n`` is a +# variable, it defines a consistent shape between the ``A``, ``B`` and ``C`` +# tensors. Remember, no actual computation happens during this phase, as we +# are only declaring how the computation should be done. + +n = te.var("n") +A = te.placeholder((n,), name="A") +B = te.placeholder((n,), name="B") +C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + +################################################################################ +# .. note:: Lambda Functions +# +# The second argument to the ``te.compute`` method is the function that +# performs the computation. In this example, we're using an anonymous function, +# also known as a ``lambda`` function, to define the computation, in this case +# addition on the ``i``th element of ``A`` and ``B``. + +################################################################################ +# Create a Default Schedule for the Computation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# While the above lines describe the computation rule, we can compute ``C`` in +# many different ways to fit different devices. For a tensor with multiple +# axes, you can choose which axis to iterate over first, or computations can be +# split across different threads. TVM requires that the user to provide a +# schedule, which is a description of how the computation should be performed. +# Scheduling operations within TE can change loop orders, split computations +# across different threads, group blocks of data together, amongst other +# operations. 
An important concept behind schedules is that they only describe
# how the computation is performed, so different schedules for the same TE will
# produce the same result.
#
# TVM allows you to create a naive schedule that will compute ``C`` by
# iterating in row major order.
#
# .. code-block:: c
#
#   for (int i = 0; i < n; ++i) {
#     C[i] = A[i] + B[i];
#   }

s = te.create_schedule(C.op)

######################################################################
# Compile and Evaluate the Default Schedule
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# With the TE expression and a schedule, we can produce runnable code for our
# target language and architecture, in this case LLVM and a CPU. We provide
# TVM with the schedule, a list of the TE expressions that are in the schedule,
# the target and host, and the name of the function we are producing. The
# result is a type-erased function that can be called directly from Python.
#
# In the following line, we use tvm.build to create a function. The build
# function takes the schedule, the desired signature of the function (including
# the inputs and outputs) as well as the target language we want to compile to.

fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

################################################################################
# Let's run the function and compare the output to the same computation in
# numpy. The compiled TVM function exposes a concise C API that can be invoked
# from any language. We begin by creating a device (the CPU in this example)
# that TVM can compile the schedule to. In this case the device is an LLVM CPU
# target. We can then initialize the tensors on our device and perform the
# custom addition operation. To verify that the computation is correct, we can
# compare the output stored in the ``c`` tensor to the same computation
# performed by numpy.

dev = tvm.device(tgt.kind.name, 0)

n = 1024
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# To get a comparison of how fast this version is compared to numpy, create a
# helper function to run a profile of the TVM generated code.
+
import timeit

np_repeat = 100
np_running_time = timeit.timeit(
    setup="import numpy\n"
    "n = 32768\n"
    'dtype = "float32"\n'
    "a = numpy.random.rand(n, 1).astype(dtype)\n"
    "b = numpy.random.rand(n, 1).astype(dtype)\n",
    stmt="answer = a + b",
    number=np_repeat,
)
print("Numpy running time: %f" % (np_running_time / np_repeat))


def evaluate_addition(func, target, optimization, log):
    dev = tvm.device(target.kind.name, 0)
    n = 32768
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
    mean_time = evaluator(a, b, c).mean
    print("%s: %f" % (optimization, mean_time))

    log.append((optimization, mean_time))


log = [("numpy", np_running_time / np_repeat)]
evaluate_addition(fadd, tgt, "naive", log=log)

################################################################################
# Updating the Schedule to Use Parallelism
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now that we've illustrated the fundamentals of TE, let's go deeper into what
# schedules do, and how they can be used to optimize tensor expressions for
# different architectures. A schedule is a series of steps that are applied to
# an expression to transform it in a number of different ways. When a schedule
# is applied to an expression in TE, the inputs and outputs remain the same,
# but when compiled the implementation of the expression can change. This
# tensor addition, in the default schedule, is run serially but is easy to
# parallelize across all of the processor threads. We can apply the parallel
# schedule operation to our computation.

s[C].parallel(C.op.axis[0])

################################################################################
# The ``tvm.lower`` command will generate the Intermediate Representation (IR)
# of the TE, with the corresponding schedule. By lowering the expression as we
# apply different schedule operations, we can see the effect of scheduling on
# the ordering of the computation. We use the flag ``simple_mode=True`` to
# return a readable C-style statement.

print(tvm.lower(s, [A, B, C], simple_mode=True))

################################################################################
# It's now possible for TVM to run these blocks on independent threads. Let's
# compile and run this new schedule with the parallel operation applied:

fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel")
fadd_parallel(a, b, c)

tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

evaluate_addition(fadd_parallel, tgt, "parallel", log=log)

################################################################################
# Updating the Schedule to Use Vectorization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Modern CPUs also have the ability to perform SIMD operations on floating
# point values, and we can apply another schedule to our computation expression
# to take advantage of this. Accomplishing this requires multiple steps: first
# we have to split the schedule into inner and outer loops using the split
# scheduling primitive. The inner loops can then use SIMD instructions via the
# vectorize scheduling primitive, and the outer loops can be parallelized using
# the parallel scheduling primitive. Choose the split factor to be the number
# of threads on your CPU.
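
# As a convenience, you can look up how many CPUs the host reports using
# Python's standard library. This is only a sketch added here to help you pick
# a value; the factor below stays hard-coded so the tutorial behaves the same
# everywhere.
import multiprocessing

print("This machine reports %d logical CPUs" % multiprocessing.cpu_count())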
+ +# Recreate the schedule, since we modified it with the parallel operation in +# the previous example +n = te.var("n") +A = te.placeholder((n,), name="A") +B = te.placeholder((n,), name="B") +C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + +s = te.create_schedule(C.op) + +# This factor should be chosen to match the number of threads appropriate for +# your CPU. This will vary depending on architecture, but a good rule is +# setting this factor to equal the number of available CPU cores. +factor = 4 + +outer, inner = s[C].split(C.op.axis[0], factor=factor) +s[C].parallel(outer) +s[C].vectorize(inner) + +fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") + +evaluate_addition(fadd_vector, tgt, "vector", log=log) + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################ +# Comparing the Diferent Schedules +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# We can now compare the different schedules + +baseline = log[0][1] +print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) +for result in log: + print( + "%s\t%s\t%s" + % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) + ) + + +################################################################################ +# .. note:: Code Specialization +# +# As you may have noticed, the declarations of ``A``, ``B`` and ``C`` all +# take the same shape argument, ``n``. TVM will take advantage of this to +# pass only a single shape argument to the kernel, as you will find in the +# printed device code. This is one form of specialization. +# +# On the host side, TVM will automatically generate check code that checks +# the constraints in the parameters. So if you pass arrays with different +# shapes into fadd, an error will be raised. +# +# We can do more specializations. For example, we can write :code:`n = +# tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, in the +# computation declaration. The generated function will only take vectors with +# length 1024. + +################################################################################ +# We've defined, scheduled, and compiled a vector addition operator, which we +# were then able to execute on the TVM runtime. We can save the operator as a +# library, which we can then load later using the TVM runtime. + +################################################################################ +# Targeting Vector Addition for GPUs (Optional) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# TVM is capable of targeting multiple architectures. In the next example, we +# will target compilation of the vector addition to GPUs. + +# If you want to run this code, change ``run_cuda = True`` +# Note that by default this example is not run in the docs CI. + +run_cuda = False +if run_cuda: + # Change this target to the correct backend for you gpu. For example: cuda (NVIDIA GPUs), + # rocm (Radeon GPUS), OpenCL (opencl). + tgt_gpu = tvm.target.Target(target="cuda", host="llvm") + + # Recreate the schedule + n = te.var("n") + A = te.placeholder((n,), name="A") + B = te.placeholder((n,), name="B") + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + print(type(C)) + + s = te.create_schedule(C.op) + + bx, tx = s[C].split(C.op.axis[0], factor=64) + + ################################################################################ + # Finally we must bind the iteration axis bx and tx to threads in the GPU + # compute grid. 
The naive schedule is not valid for GPUs, and these are + # specific constructs that allow us to generate code that runs on a GPU. + + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + + ###################################################################### + # Compilation + # ----------- + # After we have finished specifying the schedule, we can compile it + # into a TVM function. By default TVM compiles into a type-erased + # function that can be directly called from the python side. + # + # In the following line, we use tvm.build to create a function. + # The build function takes the schedule, the desired signature of the + # function (including the inputs and outputs) as well as target language + # we want to compile to. + # + # The result of compilation fadd is a GPU device function (if GPU is + # involved) as well as a host wrapper that calls into the GPU + # function. fadd is the generated host wrapper function, it contains + # a reference to the generated device function internally. + + fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd") + + ################################################################################ + # The compiled TVM function is exposes a concise C API that can be invoked from + # any language. + # + # We provide a minimal array API in python to aid quick testing and prototyping. + # The array API is based on the `DLPack `_ standard. + # + # - We first create a GPU device. + # - Then tvm.nd.array copies the data to the GPU. + # - ``fadd`` runs the actual computation + # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness). + # + # Note that copying the data to and from the memory on the GPU is a required step. + + dev = tvm.device(tgt_gpu.kind.name, 0) + + n = 1024 + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + fadd(a, b, c) + tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + + ################################################################################ + # Inspect the Generated GPU Code + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # You can inspect the generated code in TVM. The result of tvm.build is a TVM + # Module. fadd is the host module that contains the host wrapper, it also + # contains a device module for the CUDA (GPU) function. + # + # The following code fetches the device module and prints the content code. + + if ( + tgt_gpu.kind.name == "cuda" + or tgt_gpu.kind.name == "rocm" + or tgt_gpu.kind.name.startswith("opencl") + ): + dev_module = fadd.imported_modules[0] + print("-----GPU code-----") + print(dev_module.get_source()) + else: + print(fadd.get_source()) + +################################################################################ +# Saving and Loading Compiled Modules +# ----------------------------------- +# Besides runtime compilation, we can save the compiled modules into a file and +# load them back later. +# +# The following code first performs the following steps: +# +# - It saves the compiled host module into an object file. +# - Then it saves the device module into a ptx file. 
+# - cc.create_shared calls a compiler (gcc) to create a shared library + +from tvm.contrib import cc +from tvm.contrib import utils + +temp = utils.tempdir() +fadd.save(temp.relpath("myadd.o")) +if tgt.kind.name == "cuda": + fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) +if tgt.kind.name == "rocm": + fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) +if tgt.kind.name.startswith("opencl"): + fadd.imported_modules[0].save(temp.relpath("myadd.cl")) +cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) +print(temp.listdir()) + +################################################################################ +# .. note:: Module Storage Format +# +# The CPU (host) module is directly saved as a shared library (.so). There +# can be multiple customized formats of the device code. In our example, the +# device code is stored in ptx, as well as a meta data json file. They can be +# loaded and linked separately via import. + +################################################################################ +# Load Compiled Module +# ~~~~~~~~~~~~~~~~~~~~ +# We can load the compiled module from the file system and run the code. The +# following code loads the host and device module separately and links them +# together. We can verify that the newly loaded function works. + +fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) +if tgt.kind.name == "cuda": + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) + fadd1.import_module(fadd1_dev) + +if tgt.kind.name == "rocm": + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) + fadd1.import_module(fadd1_dev) + +if tgt.kind.name.startswith("opencl"): + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) + fadd1.import_module(fadd1_dev) + +fadd1(a, b, c) +tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + +################################################################################ +# Pack Everything into One Library +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In the above example, we store the device and host code separately. TVM also +# supports export everything as one shared library. Under the hood, we pack +# the device modules into binary blobs and link them together with the host +# code. Currently we support packing of Metal, OpenCL and CUDA modules. + +fadd.export_library(temp.relpath("myadd_pack.so")) +fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) +fadd2(a, b, c) +tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + +################################################################################ +# .. note:: Runtime API and Thread-Safety +# +# The compiled modules of TVM do not depend on the TVM compiler. Instead, +# they only depend on a minimum runtime library. The TVM runtime library +# wraps the device drivers and provides thread-safe and device agnostic calls +# into the compiled functions. +# +# This means that you can call the compiled TVM functions from any thread, on +# any GPUs, provided that you have compiled the code for that GPU. + +################################################################################ +# Generate OpenCL Code +# -------------------- +# TVM provides code generation features into multiple backends. We can also +# generate OpenCL code or LLVM code that runs on CPU backends. +# +# The following code blocks generate OpenCL code, creates array on an OpenCL +# device, and verifies the correctness of the code. 
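# Note that the block below only executes when ``tgt`` is an OpenCL target,
# while this tutorial defaults to llvm. As a small optional check added here
# (assuming ``tvm.runtime.enabled`` is available in your TVM version), you can
# ask the runtime whether your build was compiled with OpenCL support before
# switching targets:
print("OpenCL runtime enabled:", tvm.runtime.enabled("opencl"))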
+
if tgt.kind.name.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    dev = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# .. note:: TE Scheduling Primitives
#
#   TVM includes a number of different scheduling primitives:
#
#   - split: splits a specified axis into two axes by the defined factor.
#   - tile: splits a computation across two axes by the defined factors.
#   - fuse: fuses two consecutive axes of one computation.
#   - reorder: can reorder the axes of a computation into a defined order.
#   - bind: can bind a computation to a specific thread, useful in GPU programming.
#   - compute_at: by default, TVM will compute tensors at the outermost level
#     of the function, or the root. compute_at specifies that one tensor
#     should be computed at the first axis of computation for another
#     operator.
#   - compute_inline: when marked inline, a computation will be expanded then
#     inserted into the address where the tensor is required.
#   - compute_root: moves a computation to the outermost layer, or root, of the
#     function. This means that the stage of the computation will be fully
#     computed before it moves on to the next stage.
#
#   A complete description of these primitives can be found in the
#   `Schedule Primitives <https://tvm.apache.org/docs/tutorials/language/schedule_primitives.html>`_ docs page.

################################################################################
# Example 2: Manually Optimizing Matrix Multiplication with TE
# ------------------------------------------------------------
#
# Now we will consider a second, more advanced example, demonstrating how with
# just 18 lines of Python code TVM speeds up a common matrix multiplication operation by 18x.
#
# **Matrix multiplication is a compute intensive operation. There are
# two important optimizations for good CPU performance:**
#
# 1. Increase the cache hit rate of memory access. Both complex
#    numerical computation and hot-spot memory access can be
#    accelerated by a high cache hit rate. This requires us to
#    transform the original memory access pattern to a pattern that fits
#    the cache policy.
#
# 2. SIMD (single instruction, multiple data), also known as the vector
#    processing unit. On each cycle, instead of processing a single
#    value, SIMD can process a small batch of data. This requires us
#    to transform the data access pattern in the loop body into a uniform
#    pattern so that the LLVM backend can lower it to SIMD.
#
# The techniques used in this tutorial are a subset of tricks mentioned in this
# `repository `_. Some of them
# have been applied by the TVM abstraction automatically, but some of them
# cannot be automatically applied due to TVM constraints.

################################################################################
# Preparation and Performance Baseline
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# We begin by collecting performance data on the `numpy` implementation of
# matrix multiplication.
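# One caveat for the baseline below: numpy's matmul performance depends heavily
# on the BLAS library it was built against (e.g. OpenBLAS or MKL). As an
# optional sketch added here, you can print that build configuration before
# comparing numbers:
import numpy

numpy.__config__.show()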
+ +import tvm +import tvm.testing +from tvm import te +import numpy + +# The size of the matrix +# (M, K) x (K, N) +# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL. +M = 1024 +K = 1024 +N = 1024 + +# The default tensor data type in tvm +dtype = "float32" + +# You will want to adjust the target to match any CPU vector extensions you +# might have. For example, if you're using using Intel AVX2 (Advanced Vector +# Extensions) ISA for SIMD, you can get the best performance by changing the +# following line to ``llvm -mcpu=core-avx2``, or specific type of CPU you use. +# Recall that you're using llvm, you can get this information from the command +# ``llc --version`` to get the CPU type, and you can check ``/proc/cpuinfo`` +# for additional extensions that your processor might support. + +target = tvm.target.Target(target="llvm", host="llvm") +dev = tvm.device(target.kind.name, 0) + +# Random generated tensor for testing +a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev) +b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev) + +# Repeatedly perform a matrix multiplication to get a performance baseline +# for the default numpy implementation +np_repeat = 100 +np_running_time = timeit.timeit( + setup="import numpy\n" + "M = " + str(M) + "\n" + "K = " + str(K) + "\n" + "N = " + str(N) + "\n" + 'dtype = "float32"\n' + "a = numpy.random.rand(M, K).astype(dtype)\n" + "b = numpy.random.rand(K, N).astype(dtype)\n", + stmt="answer = numpy.dot(a, b)", + number=np_repeat, +) +print("Numpy running time: %f" % (np_running_time / np_repeat)) + +answer = numpy.dot(a.numpy(), b.numpy()) + +################################################################################ +# Now we write a basic matrix multiplication using TVM TE and verify that it +# produces the same results as the numpy implementation. We also write a +# function that will help us measure the performance of the schedule +# optimizations. + +# TVM Matrix Multiplication using TE +k = te.reduce_axis((0, K), "k") +A = te.placeholder((M, K), name="A") +B = te.placeholder((K, N), name="B") +C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C") + +# Default schedule +s = te.create_schedule(C.op) +func = tvm.build(s, [A, B, C], target=target, name="mmult") + +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) +func(a, b, c) +tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) + + +def evaluate_operation(s, vars, target, name, optimization, log): + func = tvm.build(s, [A, B, C], target=target, name="mmult") + assert func + + c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) + func(a, b, c) + tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) + + evaluator = func.time_evaluator(func.entry_name, dev, number=10) + mean_time = evaluator(a, b, c).mean + print("%s: %f" % (optimization, mean_time)) + log.append((optimization, mean_time)) + + +log = [] + +evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="none", log=log) + +################################################################################ +# Let's take a look at the intermediate representation of the operator and +# default schedule using the TVM lower function. Note how the implementation is +# essentially a naive implementation of a matrix multiplication, using three +# nested loops over the indices of the A and B matrices. 
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 1: Blocking
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# An important trick to enhance the cache hit rate is blocking, where you
+# structure memory accesses so that the data used inside a block forms a small
+# neighborhood with high memory locality. In this tutorial, we pick a block
+# factor of 32. This will result in a block that fills a 32 * 32 * sizeof(float)
+# area of memory, or 4 KB, relative to a reference L1 cache size of 32 KB.
+#
+# We begin by creating a default schedule for the ``C`` operation, then apply a
+# ``tile`` scheduling primitive to it with the specified block factor, with the
+# scheduling primitive returning the resulting loop order from outermost to
+# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. We then get
+# the reduction axis for the output of the operation, and perform a split
+# operation on it using a factor of 4. This factor doesn't directly impact the
+# blocking optimization we're working on right now, but will be useful later
+# when we apply vectorization.
+#
+# Now that the operation has been blocked, we can reorder the computation to
+# put the reduction operation into the outermost loop of the computation,
+# helping to guarantee that the blocked data remains in cache. This completes
+# the schedule, and we can build and test the performance compared to the naive
+# schedule.
+
+bn = 32
+
+# Blocking by loop tiling
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+# Hoist reduction domain outside the blocking loop
+s[C].reorder(xo, yo, ko, ki, xi, yi)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log)
+
+################################################################################
+# By reordering the computation to take advantage of caching, you should see a
+# significant improvement in the performance of the computation. Now, print the
+# internal representation and compare it to the original:
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 2: Vectorization
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Another important optimization trick is vectorization. When the memory access
+# pattern is uniform, the compiler can detect this pattern and pass the
+# contiguous memory to the SIMD vector processor. In TVM, we can use the
+# ``vectorize`` interface to hint to the compiler about this pattern, taking
+# advantage of this hardware feature.
+#
+# In this tutorial, we choose to vectorize the inner loop of row data since it
+# is already cache friendly from our previous optimizations.
+
+# Apply the vectorization optimization
+s[C].vectorize(yi)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="vectorization", log=log)
+
+# The generalized IR after vectorization
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 3: Loop Permutation
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# If we look at the above IR, we can see the inner loop row data is vectorized
+# and B is transformed into PackedB (this is evident by the `(float32x32*)B2`
+# portion of the inner loop). The traversal of PackedB is sequential now.
+# So we will look at the access pattern of A. In the current schedule, A is
+# accessed column by column, which is not cache friendly. If we change the
+# nested loop order of `ki` and inner axes `xi`, the access pattern for the A
+# matrix will be more cache friendly.
+
+s = te.create_schedule(C.op)
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+# re-ordering
+s[C].reorder(xo, yo, ko, xi, ki, yi)
+s[C].vectorize(yi)
+
+evaluate_operation(
+ s, [A, B, C], target=target, name="mmult", optimization="loop permutation", log=log
+)
+
+# Again, print the new generalized IR
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 4: Array Packing
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Another important trick is array packing. This trick reorders the storage
+# dimensions of the array to convert the continuous access pattern on a certain
+# dimension into a sequential pattern after flattening.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
+# :align: center
+#
+# As shown in the figure above, after blocking the computations, we can observe
+# the array access pattern of B (after flattening), which is regular but
+# discontinuous. We expect that after some transformation we can get a
+# continuous access pattern. By reordering a ``[16][16]`` array to a
+# ``[16/4][16][4]`` array, the access pattern of B will be sequential when
+# grabbing the corresponding value from the packed array.
+#
+# To accomplish this, we are going to have to start with a new default
+# schedule, taking into account the new packing of B. It's worth taking a
+# moment to comment on this: TE is a powerful and expressive language for
+# writing optimized operators, but it often requires some knowledge of the
+# underlying algorithm, data structures, and hardware target that you are
+# writing for. Later in the tutorial, we will discuss some of the options for
+# letting TVM take that burden. Regardless, let's move on with the new
+# optimized schedule.
+
+# We have to re-write the algorithm slightly.
+packedB = te.compute((N // bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
+C = te.compute(
+ (M, N),
+ lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
+ name="C",
+)
+
+s = te.create_schedule(C.op)
+
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+s[C].reorder(xo, yo, ko, xi, ki, yi)
+s[C].vectorize(yi)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="array packing", log=log)
+
+# Here is the generated IR after array packing.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 5: Optimizing Block Writing Through Caching
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Up to this point all of our optimizations have focused on efficiently
+# accessing and computing the data from the `A` and `B` matrices to compute the
+# `C` matrix. After the blocking optimization, the operator will write the
+# result to `C` block by block, and the access pattern is not sequential.
+# We can address this by using a sequential cache array, combining
+# `cache_write`, `compute_at`, and `unroll` to hold the block results and write
+# to `C` when all the block results are ready.
+
+s = te.create_schedule(C.op)
+
+# Allocate write cache
+CC = s.cache_write(C, "global")
+
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+
+# Write cache is computed at yo
+s[CC].compute_at(s[C], yo)
+
+# New inner axes
+xc, yc = s[CC].op.axis
+
+(k,) = s[CC].op.reduce_axis
+ko, ki = s[CC].split(k, factor=4)
+s[CC].reorder(ko, xc, ki, yc)
+s[CC].unroll(ki)
+s[CC].vectorize(yc)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="block caching", log=log)
+
+# Here is the generated IR after write cache blocking.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 6: Parallelization
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# So far, our computation is only designed to use a single core. Nearly all
+# modern processors have multiple cores, and the computation can benefit from
+# running in parallel. The final optimization is to take advantage of
+# thread-level parallelization.
+
+# parallel
+s[C].parallel(xo)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(
+ s, [A, B, C], target=target, name="mmult", optimization="parallelization", log=log
+)
+
+# Here is the generated IR after parallelization.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Summary of Matrix Multiplication Example
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# After applying the above simple optimizations with only 18 lines of code, our
+# generated code can begin to approach the performance of `numpy` with the Math
+# Kernel Library (MKL). Since we've been logging the performance as we've been
+# working, we can compare the results.
+
+baseline = log[0][1]
+print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20)))
+for result in log:
+ print(
+ "%s\t%s\t%s"
+ % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20))
+ )
+
+################################################################################
+# Note that the outputs on the web page reflect the running times on a
+# non-exclusive Docker container, and should be considered unreliable. You are
+# highly encouraged to run the tutorial yourself to observe the performance
+# gain achieved by TVM, and to carefully work through each example to
+# understand the iterative improvements that are made to the matrix
+# multiplication operation.
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# As mentioned earlier, applying optimizations using TE and scheduling
+# primitives can require some knowledge of the underlying architecture and
+# algorithms. However, TE was designed to act as a foundation for more complex
+# algorithms that can search for potential optimizations. With the knowledge you
+# have from this introduction to TE, we can now begin to explore how TVM can
+# automate the schedule optimization process.
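+
+################################################################################
+# Before summarizing, here is a compact recap sketch of the TE workflow shown
+# in this tutorial, using the vector addition example from the beginning. This
+# is an illustrative sketch only: the variable names are chosen to avoid
+# clashing with the matrix multiplication example above, and the library file
+# name is just an example. It reuses the ``tvm``, ``te``, and ``tvm.testing``
+# imports already present in this file.
+
+import numpy as np
+
+# 1. Describe the computation as a series of operations.
+vec_n = 1024
+VA = te.placeholder((vec_n,), name="VA")
+VB = te.placeholder((vec_n,), name="VB")
+VC = te.compute((vec_n,), lambda i: VA[i] + VB[i], name="VC")
+
+# 2. Describe how to compute it by creating (and optionally transforming) a schedule.
+vec_s = te.create_schedule(VC.op)
+
+# 3. Compile to a function for the desired target.
+vec_fadd = tvm.build(vec_s, [VA, VB, VC], target="llvm", name="recap_vector_add")
+
+# 4. Run the compiled function, and optionally save it so it can be loaded later.
+vec_dev = tvm.device("llvm", 0)
+va = tvm.nd.array(np.random.uniform(size=vec_n).astype("float32"), vec_dev)
+vb = tvm.nd.array(np.random.uniform(size=vec_n).astype("float32"), vec_dev)
+vc = tvm.nd.array(np.zeros(vec_n, dtype="float32"), vec_dev)
+vec_fadd(va, vb, vc)
+tvm.testing.assert_allclose(vc.numpy(), va.numpy() + vb.numpy())
+vec_fadd.export_library("recap_vector_add.so")  # example file name only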
+
+#
+# This tutorial provided a walkthrough of the TVM Tensor Expression (TE)
+# workflow using vector add and matrix multiplication examples. The general
+# workflow is:
+#
+# - Describe your computation via a series of operations.
+# - Describe how we want the computation to be performed using schedule primitives.
+# - Compile to the target function we want.
+# - Optionally, save the function to be loaded later.
+#
+# Upcoming tutorials expand on the matrix multiplication example, and show how
+# you can build generic templates of the matrix multiplication and other
+# operations with tunable parameters that allow you to automatically optimize
+# the computation for specific platforms.
diff --git a/gallery/user_tutorials/tvmc_command_line_driver.py b/gallery/user_tutorials/tvmc_command_line_driver.py
new file mode 100644
index 000000000000..c729b86a3245
--- /dev/null
+++ b/gallery/user_tutorials/tvmc_command_line_driver.py
@@ -0,0 +1,511 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compiling and Optimizing a Model with TVMC
+==========================================
+**Authors**:
+`Leandro Nunes `_,
+`Matthew Barrett `_,
+`Chris Hoge `_
+
+In this section, we will work with TVMC, the TVM command line driver. TVMC is a
+tool that exposes TVM features such as auto-tuning, compiling, profiling and
+execution of models through a command line interface.
+
+Upon completion of this section, we will have used TVMC to accomplish the
+following tasks:
+
+* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
+* Run a real image through the compiled model, and interpret the output and
+ model performance.
+* Tune the model on a CPU using TVM.
+* Re-compile an optimized model using the tuning data collected by TVM.
+* Run the image through the optimized model, and compare the output and model
+ performance.
+
+The goal of this section is to give you an overview of TVM and TVMC's
+capabilities, and set the stage for understanding how TVM works.
+"""
+
+################################################################################
+# Using TVMC
+# ----------
+#
+# TVMC is a Python application, part of the TVM Python package.
+# When you install TVM using a Python package, you will get TVMC as
+# a command line application called ``tvmc``. The location of this command
+# will vary depending on your platform and installation method.
+#
+# Alternatively, if you have TVM as a Python module on your
+# ``$PYTHONPATH``, you can access the command line driver functionality
+# via the executable Python module, ``python -m tvm.driver.tvmc``.
+#
+# For simplicity, this tutorial will mention the TVMC command line using
+# ``tvmc ``, but the same results can be obtained with
+# ``python -m tvm.driver.tvmc ``.
+#
+# You can check the help page using:
+#
+# .. code-block:: bash
+#
+# tvmc --help
+#
+# The main features of TVM available through ``tvmc`` are the ``compile``,
+# ``run``, and ``tune`` subcommands. To read about specific options under
+# a given subcommand, use ``tvmc --help``. We will cover each of
+# these commands in this tutorial, but first we need to download a pre-trained
+# model to work with.
+#
+
+
+################################################################################
+# Obtaining the Model
+# -------------------
+#
+# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a
+# convolutional neural network that is 50 layers deep and designed to classify
+# images. The model we will be using has been pre-trained on more than a
+# million images with 1000 different classifications. The network has an input
+# image size of 224x224. If you are interested in exploring more of how the
+# ResNet-50 model is structured, we recommend downloading `Netron
+# `, a freely available ML model viewer.
+#
+# For this tutorial we will be using the model in ONNX format.
+#
+# .. code-block:: bash
+#
+# wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx
+#
+
+
+################################################################################
+# .. note:: Supported model formats
+#
+# TVMC supports models created with Keras, ONNX, TensorFlow, TFLite
+# and Torch. Use the option ``--model-format`` if you need to
+# explicitly provide the model format you are using. See ``tvmc
+# compile --help`` for more information.
+#
+
+################################################################################
+# .. note:: Adding ONNX Support to TVM
+#
+# TVM relies on the ONNX Python library being available on your system. You
+# can install ONNX using the command ``pip3 install --user onnx``. You may
+# remove the ``--user`` option if you have root access and want to install
+# ONNX globally.
+#
+
+################################################################################
+# Compiling an ONNX Model to the TVM Runtime
+# ------------------------------------------
+#
+# Once we've downloaded the ResNet-50 model, the next step is to compile it. To
+# accomplish that, we are going to use ``tvmc compile``. The output we get from
+# the compilation process is a TAR package of the model compiled to a dynamic
+# library for our target platform. We can run that model on our target device
+# using the TVM runtime.
+#
+# .. code-block:: bash
+#
+# tvmc compile \
+# --target "llvm" \
+# --output resnet50-v2-7-tvm.tar \
+# resnet50-v2-7.onnx
+#
+# Let's take a look at the files that ``tvmc compile`` creates in the module:
+#
+# .. code-block:: bash
+#
+# mkdir model
+# tar -xvf resnet50-v2-7-tvm.tar -C model
+# ls model
+#
+# You will see three files listed.
+#
+# * ``mod.so`` is the model, represented as a C++ library, that can be loaded
+# by the TVM runtime.
+# * ``mod.json`` is a text representation of the TVM Relay computation graph.
+# * ``mod.params`` is a file containing the parameters for the pre-trained
+# model.
+#
+# This module can be directly loaded by your application, and the model can be
+# run via the TVM runtime APIs.
+
+
+################################################################################
+# .. note:: Defining the Correct Target
+#
+# Specifying the correct target (option ``--target``) can have a huge
+# impact on the performance of the compiled module, as it can take
+# advantage of hardware features available on the target.
For more +# information, please refer to `Auto-tuning a convolutional network +# for x86 CPU `_. +# We recommend identifying which CPU you are running, along with optional features, +# and set the target appropriately. +# + +################################################################################ +# Running the Model from The Compiled Module with TVMC +# ---------------------------------------------------- +# +# Now that we've compiled the model to this module, we can use the TVM runtime +# to make predictions with it. TVMC has the TVM runtime built in to it, +# allowing you to run compiled TVM models. To use TVMC to run the model and +# make predictions, we need two things: +# +# - The compiled module, which we just produced. +# - Valid input to the model to make predictions on. +# +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This +# is a well-supported NumPy format to serialize multiple arrays into a file +# +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. +# +# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg +# :height: 224px +# :width: 224px +# :align: center + + +################################################################################ +# Input pre-processing +# ~~~~~~~~~~~~~~~~~~~~ +# +# For our ResNet 50 V2 model, the input is expected to be in ImageNet format. +# Here is an example of a script to pre-process an image for ResNet 50 V2. +# +# You will need to have a supported version of the Python Image Library +# installed. You can use ``pip3 install --user pillow`` to satisfy this +# requirement for the script. +# +# .. code-block:: python +# :caption: preprocess.py +# :name: preprocess.py +# +# #!python ./preprocess.py +# from tvm.contrib.download import download_testdata +# from PIL import Image +# import numpy as np +# +# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") +# +# # Resize it to 224x224 +# resized_image = Image.open(img_path).resize((224, 224)) +# img_data = np.asarray(resized_image).astype("float32") +# +# # ONNX expects NCHW input, so convert the array +# img_data = np.transpose(img_data, (2, 0, 1)) +# +# # Normalize according to ImageNet +# imagenet_mean = np.array([0.485, 0.456, 0.406]) +# imagenet_stddev = np.array([0.229, 0.224, 0.225]) +# norm_img_data = np.zeros(img_data.shape).astype("float32") +# for i in range(img_data.shape[0]): +# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] +# +# # Add batch dimension +# img_data = np.expand_dims(norm_img_data, axis=0) +# +# # Save to .npz (outputs imagenet_cat.npz) +# np.savez("imagenet_cat", data=img_data) +# + +################################################################################ +# Running the Compiled Module +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# With both the model and input data in hand, we can now run TVMC to make a +# prediction: +# +# .. code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm.tar +# +# Recall that the `.tar` model file includes a C++ library, a description of +# the Relay model, and the parameters for the model. 
TVMC includes the TVM +# runtime, which can load the model and make predictions against input. When +# running the above command, TVMC outputs a new file, ``predictions.npz``, that +# contains the model output tensors in NumPy format. +# +# In this example, we are running the model on the same machine that we used +# for compilation. In some cases we might want to run it remotely via an RPC +# Tracker. To read more about these options please check ``tvmc run --help``. + +################################################################################ +# Output Post-Processing +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# As previously mentioned, each model will have its own particular way of +# providing output tensors. +# +# In our case, we need to run some post-processing to render the outputs from +# ResNet 50 V2 into a more human-readable form, using the lookup-table provided +# for the model. +# +# The script below shows an example of the post-processing to extract labels +# from the output of our compiled module. +# +# .. code-block:: python +# :caption: postprocess.py +# :name: postprocess.py +# +# #!python ./postprocess.py +# import os.path +# import numpy as np +# +# from scipy.special import softmax +# +# from tvm.contrib.download import download_testdata +# +# # Download a list of labels +# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +# labels_path = download_testdata(labels_url, "synset.txt", module="data") +# +# with open(labels_path, "r") as f: +# labels = [l.rstrip() for l in f] +# +# output_file = "predictions.npz" +# +# # Open the output and read the output tensor +# if os.path.exists(output_file): +# with np.load(output_file) as data: +# scores = softmax(data["output_0"]) +# scores = np.squeeze(scores) +# ranks = np.argsort(scores)[::-1] +# +# for rank in ranks[0:5]: +# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) +# +# Running this script should produce the following output: +# +# .. code-block:: bash +# +# python postprocess.py +# +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 +# +# Try replacing the cat image with other images, and see what sort of +# predictions the ResNet model makes. + +################################################################################ +# Automatically Tuning the ResNet Model +# ------------------------------------- +# +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVMC to target your working platform. +# +# In some cases, we might not get the expected performance when running +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner, to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. 
+
+# The results of these runs are stored in a tuning records file, which is
+# ultimately the output of the ``tune`` subcommand.
+#
+# In the simplest form, tuning requires you to provide three things:
+#
+# - the target specification of the device you intend to run this model on
+# - the path to an output file in which the tuning records will be stored, and
+# finally
+# - a path to the model to be tuned.
+#
+# The example below demonstrates how that works in practice:
+#
+# .. code-block:: bash
+#
+# tvmc tune \
+# --target "llvm" \
+# --output resnet50-v2-7-autotuner_records.json \
+# resnet50-v2-7.onnx
+#
+# In this example, you will see better results if you indicate a more specific
+# target for the `--target` flag. For example, on an Intel i7 processor you
+# could use `--target llvm -mcpu=skylake`. For this tuning example, we are
+# tuning locally on the CPU using LLVM as the compiler for the specified
+# architecture.
+#
+# TVMC will perform a search against the parameter space for the model, trying
+# out different configurations for operators and choosing the one that runs
+# fastest on your platform. Although this is a guided search based on the CPU
+# and model operations, it can still take several hours to complete the search.
+# The output of this search will be saved to the
+# `resnet50-v2-7-autotuner_records.json` file, which will later be used to
+# compile an optimized model.
+#
+# .. note:: Defining the Tuning Search Algorithm
+#
+# By default this search is guided using an `XGBoost Grid` algorithm.
+# Depending on your model complexity and the amount of time available, you
+# might want to choose a different algorithm. A full list is available by
+# consulting ``tvmc tune --help``.
+#
+# The output will look something like this for a consumer-level Skylake CPU:
+#
+# .. code-block:: bash
+#
+# tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx
+# # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
+# # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
+# # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
+# # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
+# # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
+# # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
+# # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
+# # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
+# # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
+# # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
+# # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
+# # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
+# # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
+# # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
+# # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
+# # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
+# # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done.
+# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. +# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. +# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. +# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. +# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. +# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. +# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. +# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. +# +# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning +# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning +# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information. +# + +################################################################################ +# Compiling an Optimized Model with Tuning Data +# ---------------------------------------------- +# +# As an output of the tuning process above, we obtained the tuning records +# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in +# two ways: +# +# - As input to further tuning (via ``tvmc tune --tuning-records``). +# - As input to the compiler +# +# The compiler will use the results to generate high performance code for the +# model on your specified target. To do that we can use ``tvmc compile +# --tuning-records``. Check ``tvmc compile --help`` for more information. +# +# Now that tuning data for the model has been collected, we can re-compile the +# model using optimized operators to speed up our computations. +# +# .. code-block:: bash +# +# tvmc compile \ +# --target "llvm" \ +# --tuning-records resnet50-v2-7-autotuner_records.json \ +# --output resnet50-v2-7-tvm_autotuned.tar \ +# resnet50-v2-7.onnx +# +# Verify that the optimized model runs and produces the same results: +# +# .. code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm_autotuned.tar +# +# python postprocess.py +# +# Verifying that the predictions are the same: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610550 +# # class='n02123159 tiger cat' with probability=0.367181 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Comparing the Tuned and Untuned Models +# -------------------------------------- +# +# TVMC gives you tools for basic performance benchmarking between the models. +# You can specify a number of repetitions and that TVMC report on the model run +# time (independent of runtime startup). We can get a rough idea of how much +# tuning has improved the model performance. For example, on a test Intel i7 +# system, we see that the tuned model runs 47% faster than the untuned model: +# +# .. 
code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm_autotuned.tar +# +# # Execution time summary: +# # mean (ms) max (ms) min (ms) std (ms) +# # 92.19 115.73 89.85 3.15 +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm.tar +# +# # Execution time summary: +# # mean (ms) max (ms) min (ms) std (ms) +# # 193.32 219.97 185.04 7.11 +# + + +################################################################################ +# Final Remarks +# ------------- +# +# In this tutorial, we presented TVMC, a command line driver for TVM. We +# demonstrated how to compile, run, and tune a model. We also discussed the +# need for pre and post-processing of inputs and outputs. After the tuning +# process, we demonstrated how to compare the performance of the unoptimized +# and optimize models. +# +# Here we presented a simple example using ResNet 50 V2 locally. However, TVMC +# supports many more features including cross-compilation, remote execution and +# profiling/benchmarking. +# +# To see what other options are available, please have a look at ``tvmc +# --help``. +# +# In the next tutorial, `Compiling and Optimizing a Model with the Python +# Interface `_, we will cover the same compilation +# and optimization steps using the Python interface. From 7c6d038139fecb90939b781d286359369cff4115 Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 13 Aug 2021 03:56:12 +0000 Subject: [PATCH 2/2] More documentation refactoring --- docs/conf.py | 75 +- docs/dev_how_tos/index.rst | 117 ++- .../code_guide.rst | 0 .../code_review.rst | 0 .../committer_guide.rst | 0 .../community.rst | 0 .../{contribute => dev_tutorial}/document.rst | 0 .../error_handling.rst | 0 .../git_howto.rst | 0 docs/{contribute => dev_tutorial}/index.rst | 4 +- .../pull_request.rst | 0 .../release_process.rst | 0 docs/dev_tutorials/index.rst | 24 - docs/index.rst | 8 +- docs/user_how_tos/index.rst | 14 +- docs/{ => user_how_tos}/install/docker.rst | 0 .../install/from_source.rst | 0 docs/{ => user_how_tos}/install/index.rst | 2 +- docs/{ => user_how_tos}/install/nnpack.rst | 0 docs/user_reference/index.rst | 6 +- docs/user_tutorial/index.rst | 223 +++++ docs/user_tutorials/index.rst | 25 - gallery/dev_how_tos/README.txt | 5 + .../dev_how_tos}/bring_your_own_datatypes.py | 0 .../dev_how_tos}/low_level_custom_pass.py | 0 .../dev_how_tos}/use_pass_infra.py | 0 .../dev_how_tos}/use_pass_instrument.py | 0 .../user_how_tos/compile_models/README.txt | 4 + .../compile_models}/from_caffe2.py | 0 .../compile_models}/from_coreml.py | 0 .../compile_models}/from_darknet.py | 0 .../compile_models}/from_keras.py | 0 .../compile_models}/from_mxnet.py | 0 .../user_how_tos/compile_models}/from_onnx.py | 0 .../compile_models}/from_pytorch.py | 0 .../compile_models}/from_tensorflow.py | 0 .../compile_models}/from_tflite.py | 0 gallery/user_how_tos/deploy_models/README.txt | 4 + .../deploy_models}/deploy_model_on_android.py | 0 .../deploy_models}/deploy_model_on_rasp.py | 0 .../deploy_object_detection_pytorch.py | 0 .../deploy_models}/deploy_prequantized.py | 0 .../deploy_prequantized_tflite.py | 0 .../deploy_models}/deploy_quantized.py | 0 .../deploy_models}/deploy_sparse.py | 0 .../deploy_models}/deploy_ssd_gluoncv.py | 0 .../optimize_tensor_operators}/README.txt | 2 + .../opt_conv_cuda.py | 0 .../opt_conv_tensorcore.py | 0 .../optimize_tensor_operators}/opt_gemm.py | 0 
.../tune_with_autoscheduler/README.txt | 4 + .../ci_logs/conv2d.json | 0 .../ci_logs/matmul.json | 0 .../ci_logs/resnet-18-NHWC-B1-cuda.json | 0 .../ci_logs/resnet-50-NHWC-B1-llvm.json | 0 .../ci_logs/sparse_dense.json | 0 .../tune_conv2d_layer_cuda.py | 0 .../tune_network_arm.py | 0 .../tune_network_cuda.py | 0 .../tune_network_mali.py | 0 .../tune_network_x86.py | 0 .../tune_sparse_x86.py | 0 .../tune_with_templates_autotvm/README.txt | 4 + .../tune_conv2d_cuda.py | 0 .../tune_relay_arm.py | 0 .../tune_relay_cuda.py | 0 .../tune_relay_mobile_gpu.py | 0 .../tune_relay_x86.py | 0 .../work_with_microtvm/README.txt | 3 + .../work_with_microtvm}/micro_reference_vm.py | 0 .../work_with_microtvm}/micro_tflite.py | 0 .../user_how_tos/work_with_relay/README.txt | 4 + .../work_with_relay}/build_gcn.py | 0 .../work_with_relay}/using_external_lib.py | 0 .../work_with_te_schedules/README.txt | 4 + .../work_with_te_schedules}/extern_op.py | 0 .../work_with_te_schedules}/intrin_math.py | 0 .../work_with_te_schedules}/reduction.py | 0 .../work_with_te_schedules}/scan.py | 0 .../schedule_primitives.py | 0 .../work_with_te_schedules}/tedd.py | 0 .../work_with_te_schedules}/tensorize.py | 0 .../work_with_te_schedules}/tuple_inputs.py | 0 .../README.txt | 4 +- .../auto_scheduler_matmul_x86.py | 0 .../autotvm_matmul_x86.py | 0 .../autotvm_relay_x86.py | 0 .../cross_compilation_and_rpc.py | 0 .../install.py | 0 .../user_tutorial}/intro_topi.py | 0 .../introduction.py | 0 .../relay_quick_start.py | 0 .../tensor_expr_get_started.py | 0 .../tvmc_command_line_driver.py | 0 tutorials/auto_scheduler/README.txt | 2 - tutorials/autotvm/README.txt | 4 - tutorials/dev/README.txt | 3 - tutorials/frontend/README.txt | 4 - tutorials/get_started/README.txt | 2 - .../get_started/auto_scheduler_matmul_x86.py | 214 ----- tutorials/get_started/autotvm_matmul_x86.py | 377 -------- tutorials/get_started/autotvm_relay_x86.py | 476 --------- .../get_started/cross_compilation_and_rpc.py | 265 ----- tutorials/get_started/install.py | 50 - tutorials/get_started/introduction.py | 134 --- tutorials/get_started/relay_quick_start.py | 155 --- .../get_started/tensor_expr_get_started.py | 903 ------------------ .../get_started/tvmc_command_line_driver.py | 511 ---------- tutorials/language/README.txt | 2 - tutorials/micro/README.txt | 4 - tutorials/topi/README.txt | 2 - 111 files changed, 433 insertions(+), 3211 deletions(-) rename docs/{contribute => dev_tutorial}/code_guide.rst (100%) rename docs/{contribute => dev_tutorial}/code_review.rst (100%) rename docs/{contribute => dev_tutorial}/committer_guide.rst (100%) rename docs/{contribute => dev_tutorial}/community.rst (100%) rename docs/{contribute => dev_tutorial}/document.rst (100%) rename docs/{contribute => dev_tutorial}/error_handling.rst (100%) rename docs/{contribute => dev_tutorial}/git_howto.rst (100%) rename docs/{contribute => dev_tutorial}/index.rst (96%) rename docs/{contribute => dev_tutorial}/pull_request.rst (100%) rename docs/{contribute => dev_tutorial}/release_process.rst (100%) delete mode 100644 docs/dev_tutorials/index.rst rename docs/{ => user_how_tos}/install/docker.rst (100%) rename docs/{ => user_how_tos}/install/from_source.rst (100%) rename docs/{ => user_how_tos}/install/index.rst (98%) rename docs/{ => user_how_tos}/install/nnpack.rst (100%) create mode 100644 docs/user_tutorial/index.rst delete mode 100644 docs/user_tutorials/index.rst create mode 100644 gallery/dev_how_tos/README.txt rename {tutorials/dev => gallery/dev_how_tos}/bring_your_own_datatypes.py 
(100%) rename {tutorials/dev => gallery/dev_how_tos}/low_level_custom_pass.py (100%) rename {tutorials/dev => gallery/dev_how_tos}/use_pass_infra.py (100%) rename {tutorials/dev => gallery/dev_how_tos}/use_pass_instrument.py (100%) create mode 100644 gallery/user_how_tos/compile_models/README.txt rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_caffe2.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_coreml.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_darknet.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_keras.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_mxnet.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_onnx.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_pytorch.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_tensorflow.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_tflite.py (100%) create mode 100644 gallery/user_how_tos/deploy_models/README.txt rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_model_on_android.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_model_on_rasp.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_object_detection_pytorch.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_prequantized.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_prequantized_tflite.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_quantized.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_sparse.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_ssd_gluoncv.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/README.txt (53%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_conv_cuda.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_conv_tensorcore.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_gemm.py (100%) create mode 100644 gallery/user_how_tos/tune_with_autoscheduler/README.txt rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/conv2d.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/matmul.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/resnet-18-NHWC-B1-cuda.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/resnet-50-NHWC-B1-llvm.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/sparse_dense.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_conv2d_layer_cuda.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_arm.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_cuda.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_mali.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_x86.py 
(100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_sparse_x86.py (100%) create mode 100644 gallery/user_how_tos/tune_with_templates_autotvm/README.txt rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_conv2d_cuda.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_arm.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_cuda.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_mobile_gpu.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_x86.py (100%) create mode 100644 gallery/user_how_tos/work_with_microtvm/README.txt rename {tutorials/micro => gallery/user_how_tos/work_with_microtvm}/micro_reference_vm.py (100%) rename {tutorials/micro => gallery/user_how_tos/work_with_microtvm}/micro_tflite.py (100%) create mode 100644 gallery/user_how_tos/work_with_relay/README.txt rename {tutorials/frontend => gallery/user_how_tos/work_with_relay}/build_gcn.py (100%) rename {tutorials/frontend => gallery/user_how_tos/work_with_relay}/using_external_lib.py (100%) create mode 100644 gallery/user_how_tos/work_with_te_schedules/README.txt rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/extern_op.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/intrin_math.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/reduction.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/scan.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/schedule_primitives.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tedd.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tensorize.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tuple_inputs.py (100%) rename gallery/{user_tutorials => user_tutorial}/README.txt (70%) rename gallery/{user_tutorials => user_tutorial}/auto_scheduler_matmul_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/autotvm_matmul_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/autotvm_relay_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/cross_compilation_and_rpc.py (100%) rename gallery/{user_tutorials => user_tutorial}/install.py (100%) rename {tutorials/topi => gallery/user_tutorial}/intro_topi.py (100%) rename gallery/{user_tutorials => user_tutorial}/introduction.py (100%) rename gallery/{user_tutorials => user_tutorial}/relay_quick_start.py (100%) rename gallery/{user_tutorials => user_tutorial}/tensor_expr_get_started.py (100%) rename gallery/{user_tutorials => user_tutorial}/tvmc_command_line_driver.py (100%) delete mode 100644 tutorials/auto_scheduler/README.txt delete mode 100644 tutorials/autotvm/README.txt delete mode 100644 tutorials/dev/README.txt delete mode 100644 tutorials/frontend/README.txt delete mode 100644 tutorials/get_started/README.txt delete mode 100644 tutorials/get_started/auto_scheduler_matmul_x86.py delete mode 100644 tutorials/get_started/autotvm_matmul_x86.py delete mode 100644 tutorials/get_started/autotvm_relay_x86.py delete mode 100644 tutorials/get_started/cross_compilation_and_rpc.py delete mode 100644 tutorials/get_started/install.py delete mode 100644 tutorials/get_started/introduction.py delete mode 
100644 tutorials/get_started/relay_quick_start.py delete mode 100644 tutorials/get_started/tensor_expr_get_started.py delete mode 100644 tutorials/get_started/tvmc_command_line_driver.py delete mode 100644 tutorials/language/README.txt delete mode 100644 tutorials/micro/README.txt delete mode 100644 tutorials/topi/README.txt diff --git a/docs/conf.py b/docs/conf.py index 6962c448ed53..56506482bc55 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -197,12 +197,43 @@ def git_describe_version(original_version): from sphinx_gallery.sorting import ExplicitOrder -examples_dirs = ["../gallery/user_tutorials", "../tutorials/", "../vta/tutorials/"] -gallery_dirs = ["user_tutorials", "tutorials", "vta/tutorials"] +# This variable specifices where splinx gallery should look +# for gallery files. +examples_dirs = ["../gallery/user_tutorial", + "../gallery/user_how_tos/compile_models", + "../gallery/user_how_tos/deploy_models", + "../gallery/user_how_tos/work_with_relay", + "../gallery/user_how_tos/work_with_te_schedules", + "../gallery/user_how_tos/optimize_tensor_operators", + "../gallery/user_how_tos/tune_with_templates_autotvm", + "../gallery/user_how_tos/tune_with_autoscheduler", + "../gallery/user_how_tos/work_with_microtvm", + "../gallery/dev_how_tos", + "../tutorials/", + "../vta/tutorials/"] + +# This variable specifies where sphinx gallery should place +# output from gallery generation, relative to the top level +# docd directory. The output will be mapped as an ordered +# tuple to the example_dirs. For example, the output +# from ../gallery/user_tutorials will be rendered +# in tvm/docs/user_tutorials. +gallery_dirs = ["user_tutorial", + "user_how_tos/compile_models", + "user_how_tos/deploy_models", + "user_how_tos/work_with_relay", + "user_how_tos/work_with_te_schedules", + "user_how_tos/optimize_tensor_operators", + "user_how_tos/tune_with_templates_autotvm", + "user_how_tos/tune_with_autoscheduler", + "user_how_tos/work_with_microtvm", + "dev_how_tos", + "tutorials", + "vta/tutorials"] subsection_order = ExplicitOrder( [ - "../gallery/user_tutorials", + "../gallery/user_tutorial", "../tutorials/get_started", "../tutorials/frontend", "../tutorials/language", @@ -224,7 +255,7 @@ def git_describe_version(original_version): # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. 
within_subsection_order = { - "user_tutorials": [ + "user_tutorial": [ "introduction.py", "install.py", "tvmc_command_line_driver.py", @@ -233,20 +264,10 @@ def git_describe_version(original_version): "autotvm_matmul_x86.py", "auto_scheduler_matmul_x86.py", "cross_compilation_and_rpc.py", - "relay_quick_start.py", + "intro_topi.py", + "relay_quick_start.py" ], - "get_started": [ - "introduction.py", - "install.py", - "tvmc_command_line_driver.py", - "autotvm_relay_x86.py", - "tensor_expr_get_started.py", - "autotvm_matmul_x86.py", - "auto_scheduler_matmul_x86.py", - "cross_compilation_and_rpc.py", - "relay_quick_start.py", - ], - "frontend": [ + "compile_models": [ "from_pytorch.py", "from_tensorflow.py", "from_mxnet.py", @@ -257,7 +278,17 @@ def git_describe_version(original_version): "from_darknet.py", "from_caffe2.py", ], - "language": [ + "deploy_models": [ + "deploy_object_detection_pytorch.py", + "deploy_model_on_rasp.py", + "deploy_sparse.py", + "deploy_quantized.py", + "deploy_prequantized.py", + "deploy_prequantized_tflite.py", + "deploy_model_on_android.py", + "deploy_ssd_gluoncv.py" + ], + "work_with_te_schedules": [ "schedule_primitives.py", "reduction.py", "intrin_math.py", @@ -267,12 +298,12 @@ def git_describe_version(original_version): "tuple_inputs.py", "tedd.py", ], - "optimize": [ + "optimize_tensor_operators": [ "opt_gemm.py", "opt_conv_cuda.py", "opt_conv_tensorcore.py", ], - "autotvm": [ + "tune_with_templates_autotvm": [ "tune_simple_template.py", "tune_conv2d_cuda.py", "tune_relay_cuda.py", @@ -280,13 +311,13 @@ def git_describe_version(original_version): "tune_relay_arm.py", "tune_relay_mobile_gpu.py", ], - "auto_scheduler": [ + "tune_with_autoscheduler": [ "tune_matmul_x86.py", "tune_conv2d_layer_cuda.py", "tune_network_x86.py", "tune_network_cuda.py", ], - "dev": [ + "dev_how_tos": [ "low_level_custom_pass.py", "use_pass_infra.py", "use_pass_instrument.py", diff --git a/docs/dev_how_tos/index.rst b/docs/dev_how_tos/index.rst index e8cce9365df2..a578a206cd0d 100644 --- a/docs/dev_how_tos/index.rst +++ b/docs/dev_how_tos/index.rst @@ -1,25 +1,104 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at +:orphan: -.. http://www.apache.org/licenses/LICENSE-2.0 -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -.. _reference: +.. _sphx_glr_dev_how_tos: -Deep Dive -========= -TVM Developer Deep Dive +How To +------ +This gallery includes a number of how-tos on how to accomplish common +develpment tasks in TVM. -Refactor placeholder + + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_low_level_custom_pass_thumb.png + + :ref:`sphx_glr_dev_how_tos_low_level_custom_pass.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/low_level_custom_pass + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_use_pass_infra_thumb.png + + :ref:`sphx_glr_dev_how_tos_use_pass_infra.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/use_pass_infra + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_use_pass_instrument_thumb.png + + :ref:`sphx_glr_dev_how_tos_use_pass_instrument.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/use_pass_instrument + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_bring_your_own_datatypes_thumb.png + + :ref:`sphx_glr_dev_how_tos_bring_your_own_datatypes.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/bring_your_own_datatypes +.. raw:: html + +
+ + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/contribute/code_guide.rst b/docs/dev_tutorial/code_guide.rst similarity index 100% rename from docs/contribute/code_guide.rst rename to docs/dev_tutorial/code_guide.rst diff --git a/docs/contribute/code_review.rst b/docs/dev_tutorial/code_review.rst similarity index 100% rename from docs/contribute/code_review.rst rename to docs/dev_tutorial/code_review.rst diff --git a/docs/contribute/committer_guide.rst b/docs/dev_tutorial/committer_guide.rst similarity index 100% rename from docs/contribute/committer_guide.rst rename to docs/dev_tutorial/committer_guide.rst diff --git a/docs/contribute/community.rst b/docs/dev_tutorial/community.rst similarity index 100% rename from docs/contribute/community.rst rename to docs/dev_tutorial/community.rst diff --git a/docs/contribute/document.rst b/docs/dev_tutorial/document.rst similarity index 100% rename from docs/contribute/document.rst rename to docs/dev_tutorial/document.rst diff --git a/docs/contribute/error_handling.rst b/docs/dev_tutorial/error_handling.rst similarity index 100% rename from docs/contribute/error_handling.rst rename to docs/dev_tutorial/error_handling.rst diff --git a/docs/contribute/git_howto.rst b/docs/dev_tutorial/git_howto.rst similarity index 100% rename from docs/contribute/git_howto.rst rename to docs/dev_tutorial/git_howto.rst diff --git a/docs/contribute/index.rst b/docs/dev_tutorial/index.rst similarity index 96% rename from docs/contribute/index.rst rename to docs/dev_tutorial/index.rst index e3e4119d803e..960071b189cb 100644 --- a/docs/contribute/index.rst +++ b/docs/dev_tutorial/index.rst @@ -15,8 +15,8 @@ specific language governing permissions and limitations under the License. -Contribute to TVM -================= +Tutorial: Contributing to TVM +============================= TVM has been developed by community members. Everyone is welcomed to contribute. diff --git a/docs/contribute/pull_request.rst b/docs/dev_tutorial/pull_request.rst similarity index 100% rename from docs/contribute/pull_request.rst rename to docs/dev_tutorial/pull_request.rst diff --git a/docs/contribute/release_process.rst b/docs/dev_tutorial/release_process.rst similarity index 100% rename from docs/contribute/release_process.rst rename to docs/dev_tutorial/release_process.rst diff --git a/docs/dev_tutorials/index.rst b/docs/dev_tutorials/index.rst deleted file mode 100644 index 2705f21907b0..000000000000 --- a/docs/dev_tutorials/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. 
_tutorial: - -Tutorial -======== -TVM Developer Tutorial - -Placeholder for documentation refactor diff --git a/docs/index.rst b/docs/index.rst index ad1cfc49d4dd..09f1e0ca727c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,21 +35,21 @@ For Developers - :doc:`dev/how_to` gives quick development tips on various topics. .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: TVM User Guide :hidden: - user_tutorials/index + user_tutorial/index user_how_tos/index user_deep_dive/index user_reference/index .. toctree:: - :maxdepth: 1 + :maxdepth: 3 :caption: TVM Developer Guide :hidden: - dev_tutorials/index + dev_tutorial/index dev_how_tos/index dev_deep_dive/index dev_reference/index diff --git a/docs/user_how_tos/index.rst b/docs/user_how_tos/index.rst index cc59879e1057..8684a9e729c5 100644 --- a/docs/user_how_tos/index.rst +++ b/docs/user_how_tos/index.rst @@ -15,9 +15,21 @@ specific language governing permissions and limitations under the License. -.. _user_guide: +.. _user_how_tos: How To ======= TVM How Tos +.. toctree:: + :maxdepth: 2 + + install/index + compile_models/index + deploy_models/index + work_with_relay/index + work_with_te_schedules/index + optimize_tensor_operators/index + tune_with_templates_autotvm/index + tune_with_autoscheduler/index + work_with_microtvm/index diff --git a/docs/install/docker.rst b/docs/user_how_tos/install/docker.rst similarity index 100% rename from docs/install/docker.rst rename to docs/user_how_tos/install/docker.rst diff --git a/docs/install/from_source.rst b/docs/user_how_tos/install/from_source.rst similarity index 100% rename from docs/install/from_source.rst rename to docs/user_how_tos/install/from_source.rst diff --git a/docs/install/index.rst b/docs/user_how_tos/install/index.rst similarity index 98% rename from docs/install/index.rst rename to docs/user_how_tos/install/index.rst index 5f739418add3..3d1a06a74be7 100644 --- a/docs/install/index.rst +++ b/docs/user_how_tos/install/index.rst @@ -17,7 +17,7 @@ .. _installation: -Installation +Install TVM ============ To install TVM, please read :ref:`install-from-source`. If you are interested in deploying to mobile/embedded devices, diff --git a/docs/install/nnpack.rst b/docs/user_how_tos/install/nnpack.rst similarity index 100% rename from docs/install/nnpack.rst rename to docs/user_how_tos/install/nnpack.rst diff --git a/docs/user_reference/index.rst b/docs/user_reference/index.rst index cc59879e1057..1f715217b2a1 100644 --- a/docs/user_reference/index.rst +++ b/docs/user_reference/index.rst @@ -17,7 +17,7 @@ .. _user_guide: -How To -======= -TVM How Tos +User Reference +============== +TVM User Reference diff --git a/docs/user_tutorial/index.rst b/docs/user_tutorial/index.rst new file mode 100644 index 000000000000..d239fc6a459d --- /dev/null +++ b/docs/user_tutorial/index.rst @@ -0,0 +1,223 @@ +:orphan: + + + +.. _sphx_glr_user_tutorial: + +Tutorial +-------- + +Within this gallery is an introduction to TVM. + + + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_introduction_thumb.png + + :ref:`sphx_glr_user_tutorial_introduction.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/introduction + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_install_thumb.png + + :ref:`sphx_glr_user_tutorial_install.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/install + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_tvmc_command_line_driver_thumb.png + + :ref:`sphx_glr_user_tutorial_tvmc_command_line_driver.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/tvmc_command_line_driver + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_autotvm_relay_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_autotvm_relay_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/autotvm_relay_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_tensor_expr_get_started_thumb.png + + :ref:`sphx_glr_user_tutorial_tensor_expr_get_started.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/tensor_expr_get_started + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_autotvm_matmul_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_autotvm_matmul_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/autotvm_matmul_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_auto_scheduler_matmul_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_auto_scheduler_matmul_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/auto_scheduler_matmul_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_cross_compilation_and_rpc_thumb.png + + :ref:`sphx_glr_user_tutorial_cross_compilation_and_rpc.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/cross_compilation_and_rpc + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_intro_topi_thumb.png + + :ref:`sphx_glr_user_tutorial_intro_topi.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/intro_topi + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_relay_quick_start_thumb.png + + :ref:`sphx_glr_user_tutorial_relay_quick_start.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/relay_quick_start +.. raw:: html + +
+ + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/user_tutorials/index.rst b/docs/user_tutorials/index.rst deleted file mode 100644 index 4f01312426f2..000000000000 --- a/docs/user_tutorials/index.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _user_guide: - -Tutorial -========= -TVM User Tutorial - -This is a placeholder and is overwritten by Spinx Gallery -/tvm/gallery/user_tutorials diff --git a/gallery/dev_how_tos/README.txt b/gallery/dev_how_tos/README.txt new file mode 100644 index 000000000000..10c824a554f4 --- /dev/null +++ b/gallery/dev_how_tos/README.txt @@ -0,0 +1,5 @@ +How To +------ +This gallery includes a number of how-tos for accomplishing common +development tasks in TVM. + diff --git a/tutorials/dev/bring_your_own_datatypes.py b/gallery/dev_how_tos/bring_your_own_datatypes.py similarity index 100% rename from tutorials/dev/bring_your_own_datatypes.py rename to gallery/dev_how_tos/bring_your_own_datatypes.py diff --git a/tutorials/dev/low_level_custom_pass.py b/gallery/dev_how_tos/low_level_custom_pass.py similarity index 100% rename from tutorials/dev/low_level_custom_pass.py rename to gallery/dev_how_tos/low_level_custom_pass.py diff --git a/tutorials/dev/use_pass_infra.py b/gallery/dev_how_tos/use_pass_infra.py similarity index 100% rename from tutorials/dev/use_pass_infra.py rename to gallery/dev_how_tos/use_pass_infra.py diff --git a/tutorials/dev/use_pass_instrument.py b/gallery/dev_how_tos/use_pass_instrument.py similarity index 100% rename from tutorials/dev/use_pass_instrument.py rename to gallery/dev_how_tos/use_pass_instrument.py diff --git a/gallery/user_how_tos/compile_models/README.txt b/gallery/user_how_tos/compile_models/README.txt new file mode 100644 index 000000000000..871d975ab68d --- /dev/null +++ b/gallery/user_how_tos/compile_models/README.txt @@ -0,0 +1,4 @@ +Compile Deep Learning Models +============================ + +How to compile models from a variety of different frameworks diff --git a/tutorials/frontend/from_caffe2.py b/gallery/user_how_tos/compile_models/from_caffe2.py similarity index 100% rename from tutorials/frontend/from_caffe2.py rename to gallery/user_how_tos/compile_models/from_caffe2.py diff --git a/tutorials/frontend/from_coreml.py b/gallery/user_how_tos/compile_models/from_coreml.py similarity index 100% rename from tutorials/frontend/from_coreml.py rename to gallery/user_how_tos/compile_models/from_coreml.py diff --git a/tutorials/frontend/from_darknet.py b/gallery/user_how_tos/compile_models/from_darknet.py similarity index 100% rename from tutorials/frontend/from_darknet.py rename to gallery/user_how_tos/compile_models/from_darknet.py diff --git a/tutorials/frontend/from_keras.py
b/gallery/user_how_tos/compile_models/from_keras.py similarity index 100% rename from tutorials/frontend/from_keras.py rename to gallery/user_how_tos/compile_models/from_keras.py diff --git a/tutorials/frontend/from_mxnet.py b/gallery/user_how_tos/compile_models/from_mxnet.py similarity index 100% rename from tutorials/frontend/from_mxnet.py rename to gallery/user_how_tos/compile_models/from_mxnet.py diff --git a/tutorials/frontend/from_onnx.py b/gallery/user_how_tos/compile_models/from_onnx.py similarity index 100% rename from tutorials/frontend/from_onnx.py rename to gallery/user_how_tos/compile_models/from_onnx.py diff --git a/tutorials/frontend/from_pytorch.py b/gallery/user_how_tos/compile_models/from_pytorch.py similarity index 100% rename from tutorials/frontend/from_pytorch.py rename to gallery/user_how_tos/compile_models/from_pytorch.py diff --git a/tutorials/frontend/from_tensorflow.py b/gallery/user_how_tos/compile_models/from_tensorflow.py similarity index 100% rename from tutorials/frontend/from_tensorflow.py rename to gallery/user_how_tos/compile_models/from_tensorflow.py diff --git a/tutorials/frontend/from_tflite.py b/gallery/user_how_tos/compile_models/from_tflite.py similarity index 100% rename from tutorials/frontend/from_tflite.py rename to gallery/user_how_tos/compile_models/from_tflite.py diff --git a/gallery/user_how_tos/deploy_models/README.txt b/gallery/user_how_tos/deploy_models/README.txt new file mode 100644 index 000000000000..cf8abf4662ff --- /dev/null +++ b/gallery/user_how_tos/deploy_models/README.txt @@ -0,0 +1,4 @@ +Deploy Deep Learning Models +============================ + +How to deploy models from a variety of different frameworks diff --git a/tutorials/frontend/deploy_model_on_android.py b/gallery/user_how_tos/deploy_models/deploy_model_on_android.py similarity index 100% rename from tutorials/frontend/deploy_model_on_android.py rename to gallery/user_how_tos/deploy_models/deploy_model_on_android.py diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/gallery/user_how_tos/deploy_models/deploy_model_on_rasp.py similarity index 100% rename from tutorials/frontend/deploy_model_on_rasp.py rename to gallery/user_how_tos/deploy_models/deploy_model_on_rasp.py diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/gallery/user_how_tos/deploy_models/deploy_object_detection_pytorch.py similarity index 100% rename from tutorials/frontend/deploy_object_detection_pytorch.py rename to gallery/user_how_tos/deploy_models/deploy_object_detection_pytorch.py diff --git a/tutorials/frontend/deploy_prequantized.py b/gallery/user_how_tos/deploy_models/deploy_prequantized.py similarity index 100% rename from tutorials/frontend/deploy_prequantized.py rename to gallery/user_how_tos/deploy_models/deploy_prequantized.py diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/gallery/user_how_tos/deploy_models/deploy_prequantized_tflite.py similarity index 100% rename from tutorials/frontend/deploy_prequantized_tflite.py rename to gallery/user_how_tos/deploy_models/deploy_prequantized_tflite.py diff --git a/tutorials/frontend/deploy_quantized.py b/gallery/user_how_tos/deploy_models/deploy_quantized.py similarity index 100% rename from tutorials/frontend/deploy_quantized.py rename to gallery/user_how_tos/deploy_models/deploy_quantized.py diff --git a/tutorials/frontend/deploy_sparse.py b/gallery/user_how_tos/deploy_models/deploy_sparse.py similarity index 100% rename from tutorials/frontend/deploy_sparse.py rename to 
gallery/user_how_tos/deploy_models/deploy_sparse.py diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/gallery/user_how_tos/deploy_models/deploy_ssd_gluoncv.py similarity index 100% rename from tutorials/frontend/deploy_ssd_gluoncv.py rename to gallery/user_how_tos/deploy_models/deploy_ssd_gluoncv.py diff --git a/tutorials/optimize/README.txt b/gallery/user_how_tos/optimize_tensor_operators/README.txt similarity index 53% rename from tutorials/optimize/README.txt rename to gallery/user_how_tos/optimize_tensor_operators/README.txt index b051548c5351..a4a3c84c2018 100644 --- a/tutorials/optimize/README.txt +++ b/gallery/user_how_tos/optimize_tensor_operators/README.txt @@ -1,2 +1,4 @@ Optimize Tensor Operators ------------------------- + +How to optimize tensor operators using TVM. diff --git a/tutorials/optimize/opt_conv_cuda.py b/gallery/user_how_tos/optimize_tensor_operators/opt_conv_cuda.py similarity index 100% rename from tutorials/optimize/opt_conv_cuda.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_conv_cuda.py diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/gallery/user_how_tos/optimize_tensor_operators/opt_conv_tensorcore.py similarity index 100% rename from tutorials/optimize/opt_conv_tensorcore.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_conv_tensorcore.py diff --git a/tutorials/optimize/opt_gemm.py b/gallery/user_how_tos/optimize_tensor_operators/opt_gemm.py similarity index 100% rename from tutorials/optimize/opt_gemm.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_gemm.py diff --git a/gallery/user_how_tos/tune_with_autoscheduler/README.txt b/gallery/user_how_tos/tune_with_autoscheduler/README.txt new file mode 100644 index 000000000000..6f69d8aa6f61 --- /dev/null +++ b/gallery/user_how_tos/tune_with_autoscheduler/README.txt @@ -0,0 +1,4 @@ +How to use AutoScheduler for Template-Free Auto Scheduling +---------------------------------------------------------- + +How to use the TVM AutoScheduler to automatically tune a schedule, template-free. 
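The how-tos gathered in this directory follow the same template-free flow as the getting-started tutorial moved by this patch: register a workload, create a search task, tune it, and apply the best schedule found. A minimal sketch of that flow, assuming a local TVM build with the LLVM backend (the shapes, trial count, and log file name are illustrative only, and as in the tutorial the tuning step may need to run under ``if __name__ == "__main__":`` on some platforms):

.. code-block:: python

    import tvm
    from tvm import te, auto_scheduler

    @auto_scheduler.register_workload  # the workload is identified by function name and arguments
    def matmul(N, L, M, dtype):
        A = te.placeholder((N, L), name="A", dtype=dtype)
        B = te.placeholder((L, M), name="B", dtype=dtype)
        k = te.reduce_axis((0, L), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
        return [A, B, C]

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(func=matmul, args=(128, 128, 128, "float32"), target=target)

    log_file = "matmul_autoschedule.json"
    task.tune(auto_scheduler.TuningOptions(
        num_measure_trials=10,  # small count for demonstration; use more trials in practice
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    ))
    sch, args = task.apply_best(log_file)          # re-apply the best schedule from the log
    print(tvm.lower(sch, args, simple_mode=True))  # inspect the generated loop nest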
diff --git a/tutorials/auto_scheduler/ci_logs/conv2d.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/conv2d.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/conv2d.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/conv2d.json diff --git a/tutorials/auto_scheduler/ci_logs/matmul.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/matmul.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/matmul.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/matmul.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json diff --git a/tutorials/auto_scheduler/ci_logs/sparse_dense.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/sparse_dense.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/sparse_dense.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/sparse_dense.json diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_conv2d_layer_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_conv2d_layer_cuda.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_conv2d_layer_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_arm.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_arm.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_arm.py diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_cuda.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_mali.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_mali.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_mali.py diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_x86.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_x86.py diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_sparse_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_sparse_x86.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_sparse_x86.py diff --git a/gallery/user_how_tos/tune_with_templates_autotvm/README.txt b/gallery/user_how_tos/tune_with_templates_autotvm/README.txt new file mode 100644 index 000000000000..99d844fef2d9 --- /dev/null +++ b/gallery/user_how_tos/tune_with_templates_autotvm/README.txt @@ -0,0 +1,4 @@ +How to Auto-Tune 
with Templates and AutoTVM +------------------------------------------- + +How to write schedule templates and auto-tune with AutoTVM diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_conv2d_cuda.py similarity index 100% rename from tutorials/autotvm/tune_conv2d_cuda.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_conv2d_cuda.py diff --git a/tutorials/autotvm/tune_relay_arm.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_arm.py similarity index 100% rename from tutorials/autotvm/tune_relay_arm.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_arm.py diff --git a/tutorials/autotvm/tune_relay_cuda.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_cuda.py similarity index 100% rename from tutorials/autotvm/tune_relay_cuda.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_cuda.py diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_mobile_gpu.py similarity index 100% rename from tutorials/autotvm/tune_relay_mobile_gpu.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_mobile_gpu.py diff --git a/tutorials/autotvm/tune_relay_x86.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_x86.py similarity index 100% rename from tutorials/autotvm/tune_relay_x86.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_x86.py diff --git a/gallery/user_how_tos/work_with_microtvm/README.txt b/gallery/user_how_tos/work_with_microtvm/README.txt new file mode 100644 index 000000000000..c77817dd9b7e --- /dev/null +++ b/gallery/user_how_tos/work_with_microtvm/README.txt @@ -0,0 +1,3 @@ +How to Work With microTVM +------------------------- +How to work with microTVM diff --git a/tutorials/micro/micro_reference_vm.py b/gallery/user_how_tos/work_with_microtvm/micro_reference_vm.py similarity index 100% rename from tutorials/micro/micro_reference_vm.py rename to gallery/user_how_tos/work_with_microtvm/micro_reference_vm.py diff --git a/tutorials/micro/micro_tflite.py b/gallery/user_how_tos/work_with_microtvm/micro_tflite.py similarity index 100% rename from tutorials/micro/micro_tflite.py rename to gallery/user_how_tos/work_with_microtvm/micro_tflite.py diff --git a/gallery/user_how_tos/work_with_relay/README.txt b/gallery/user_how_tos/work_with_relay/README.txt new file mode 100644 index 000000000000..20f7a076e397 --- /dev/null +++ b/gallery/user_how_tos/work_with_relay/README.txt @@ -0,0 +1,4 @@ +Work With Relay +=============== + +How-tos describing advanced compilation and deployment techniques with Relay diff --git a/tutorials/frontend/build_gcn.py b/gallery/user_how_tos/work_with_relay/build_gcn.py similarity index 100% rename from tutorials/frontend/build_gcn.py rename to gallery/user_how_tos/work_with_relay/build_gcn.py diff --git a/tutorials/frontend/using_external_lib.py b/gallery/user_how_tos/work_with_relay/using_external_lib.py similarity index 100% rename from tutorials/frontend/using_external_lib.py rename to gallery/user_how_tos/work_with_relay/using_external_lib.py diff --git a/gallery/user_how_tos/work_with_te_schedules/README.txt b/gallery/user_how_tos/work_with_te_schedules/README.txt new file mode 100644 index 000000000000..12499aebc720 --- /dev/null +++ b/gallery/user_how_tos/work_with_te_schedules/README.txt @@ -0,0 +1,4 @@ +Work with Tensor Expression and Schedules +----------------------------------------- + +How to work with Tensor Expressions and
Schedules diff --git a/tutorials/language/extern_op.py b/gallery/user_how_tos/work_with_te_schedules/extern_op.py similarity index 100% rename from tutorials/language/extern_op.py rename to gallery/user_how_tos/work_with_te_schedules/extern_op.py diff --git a/tutorials/language/intrin_math.py b/gallery/user_how_tos/work_with_te_schedules/intrin_math.py similarity index 100% rename from tutorials/language/intrin_math.py rename to gallery/user_how_tos/work_with_te_schedules/intrin_math.py diff --git a/tutorials/language/reduction.py b/gallery/user_how_tos/work_with_te_schedules/reduction.py similarity index 100% rename from tutorials/language/reduction.py rename to gallery/user_how_tos/work_with_te_schedules/reduction.py diff --git a/tutorials/language/scan.py b/gallery/user_how_tos/work_with_te_schedules/scan.py similarity index 100% rename from tutorials/language/scan.py rename to gallery/user_how_tos/work_with_te_schedules/scan.py diff --git a/tutorials/language/schedule_primitives.py b/gallery/user_how_tos/work_with_te_schedules/schedule_primitives.py similarity index 100% rename from tutorials/language/schedule_primitives.py rename to gallery/user_how_tos/work_with_te_schedules/schedule_primitives.py diff --git a/tutorials/language/tedd.py b/gallery/user_how_tos/work_with_te_schedules/tedd.py similarity index 100% rename from tutorials/language/tedd.py rename to gallery/user_how_tos/work_with_te_schedules/tedd.py diff --git a/tutorials/language/tensorize.py b/gallery/user_how_tos/work_with_te_schedules/tensorize.py similarity index 100% rename from tutorials/language/tensorize.py rename to gallery/user_how_tos/work_with_te_schedules/tensorize.py diff --git a/tutorials/language/tuple_inputs.py b/gallery/user_how_tos/work_with_te_schedules/tuple_inputs.py similarity index 100% rename from tutorials/language/tuple_inputs.py rename to gallery/user_how_tos/work_with_te_schedules/tuple_inputs.py diff --git a/gallery/user_tutorials/README.txt b/gallery/user_tutorial/README.txt similarity index 70% rename from gallery/user_tutorials/README.txt rename to gallery/user_tutorial/README.txt index 0d5a287feff4..ee90cd68eefc 100644 --- a/gallery/user_tutorials/README.txt +++ b/gallery/user_tutorial/README.txt @@ -1,4 +1,4 @@ -Tutorials ---------- +Tutorial +-------- Within this gallery is an introduction to TVM. 
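The ``work_with_te_schedules`` how-tos moved above all build on the same declare, schedule, and lower loop used throughout these tutorials. A minimal sketch of that loop, assuming a local TVM build (the vector-scaling workload and split factor are illustrative only):

.. code-block:: python

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A", dtype="float32")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

    s = te.create_schedule(B.op)                    # default schedule: a single serial loop
    xo, xi = s[B].split(B.op.axis[0], factor=32)    # split the loop into 32-wide chunks
    s[B].parallel(xo)                               # run the outer loop in parallel
    print(tvm.lower(s, [A, B], simple_mode=True))   # print the scheduled TIR for inspection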
diff --git a/gallery/user_tutorials/auto_scheduler_matmul_x86.py b/gallery/user_tutorial/auto_scheduler_matmul_x86.py similarity index 100% rename from gallery/user_tutorials/auto_scheduler_matmul_x86.py rename to gallery/user_tutorial/auto_scheduler_matmul_x86.py diff --git a/gallery/user_tutorials/autotvm_matmul_x86.py b/gallery/user_tutorial/autotvm_matmul_x86.py similarity index 100% rename from gallery/user_tutorials/autotvm_matmul_x86.py rename to gallery/user_tutorial/autotvm_matmul_x86.py diff --git a/gallery/user_tutorials/autotvm_relay_x86.py b/gallery/user_tutorial/autotvm_relay_x86.py similarity index 100% rename from gallery/user_tutorials/autotvm_relay_x86.py rename to gallery/user_tutorial/autotvm_relay_x86.py diff --git a/gallery/user_tutorials/cross_compilation_and_rpc.py b/gallery/user_tutorial/cross_compilation_and_rpc.py similarity index 100% rename from gallery/user_tutorials/cross_compilation_and_rpc.py rename to gallery/user_tutorial/cross_compilation_and_rpc.py diff --git a/gallery/user_tutorials/install.py b/gallery/user_tutorial/install.py similarity index 100% rename from gallery/user_tutorials/install.py rename to gallery/user_tutorial/install.py diff --git a/tutorials/topi/intro_topi.py b/gallery/user_tutorial/intro_topi.py similarity index 100% rename from tutorials/topi/intro_topi.py rename to gallery/user_tutorial/intro_topi.py diff --git a/gallery/user_tutorials/introduction.py b/gallery/user_tutorial/introduction.py similarity index 100% rename from gallery/user_tutorials/introduction.py rename to gallery/user_tutorial/introduction.py diff --git a/gallery/user_tutorials/relay_quick_start.py b/gallery/user_tutorial/relay_quick_start.py similarity index 100% rename from gallery/user_tutorials/relay_quick_start.py rename to gallery/user_tutorial/relay_quick_start.py diff --git a/gallery/user_tutorials/tensor_expr_get_started.py b/gallery/user_tutorial/tensor_expr_get_started.py similarity index 100% rename from gallery/user_tutorials/tensor_expr_get_started.py rename to gallery/user_tutorial/tensor_expr_get_started.py diff --git a/gallery/user_tutorials/tvmc_command_line_driver.py b/gallery/user_tutorial/tvmc_command_line_driver.py similarity index 100% rename from gallery/user_tutorials/tvmc_command_line_driver.py rename to gallery/user_tutorial/tvmc_command_line_driver.py diff --git a/tutorials/auto_scheduler/README.txt b/tutorials/auto_scheduler/README.txt deleted file mode 100644 index 75986679f0bd..000000000000 --- a/tutorials/auto_scheduler/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -AutoScheduler : Template-free Auto Scheduling ---------------------------------------------- diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt deleted file mode 100644 index a1d33ba088cc..000000000000 --- a/tutorials/autotvm/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorials-autotvm-sec: - -AutoTVM : Template-based Auto Tuning ------------------------------------- diff --git a/tutorials/dev/README.txt b/tutorials/dev/README.txt deleted file mode 100644 index a358280640de..000000000000 --- a/tutorials/dev/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -Developer Tutorials -------------------- - diff --git a/tutorials/frontend/README.txt b/tutorials/frontend/README.txt deleted file mode 100644 index 319506d21f8f..000000000000 --- a/tutorials/frontend/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_tutorial-frontend: - -Compile Deep Learning Models ----------------------------- diff --git a/tutorials/get_started/README.txt b/tutorials/get_started/README.txt deleted file mode 100644 index aa6c559c1b38..000000000000 --- a/tutorials/get_started/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Getting Started With TVM ------------------------- diff --git a/tutorials/get_started/auto_scheduler_matmul_x86.py b/tutorials/get_started/auto_scheduler_matmul_x86.py deleted file mode 100644 index f9fb3615aedc..000000000000 --- a/tutorials/get_started/auto_scheduler_matmul_x86.py +++ /dev/null @@ -1,214 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Optimizing Operators with Auto-scheduling -========================================= -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -In this tutorial, we will show how TVM's Auto Scheduling feature can find -optimal schedules without the need for writing a custom template. - -Different from the template-based :doc:`AutoTVM ` which relies on -manual templates to define the search space, the auto-scheduler does not -require any templates. Users only need to write the computation declaration -without any schedule commands or templates. The auto-scheduler can -automatically generate a large search space and find a good schedule in the -space. - -We use matrix multiplication as an example in this tutorial. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. To - get it to run, you will need to wrap the body of this tutorial in a :code:`if - __name__ == "__main__":` block. -""" - -import os - -import numpy as np -import tvm -from tvm import te, auto_scheduler - -################################################################################ -# Defining the Matrix Multiplication -# ---------------------------------- -# To start, we define a matrix multiplication with a bias addition. Note that -# this uses standard operations available in TVMs Tensor Expression language. -# The major difference is the use of the `auto_sceduler` decorator at the top -# of the function definition. The function should return a list of -# input/output tensors. From these tensors, the auto-scheduler can get the -# whole computational graph. 
- - -@auto_scheduler.register_workload # Note the auto_scheduler decorator -def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - C = te.placeholder((N, M), name="C", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - matmul = te.compute( - (N, M), - lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name="matmul", - attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B - ) - out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") - - return [A, B, C, out] - - -################################################################################ -# Create the search task -# ---------------------- -# With the function defined, we can now create the task for the auto_scheduler -# to search against. We specify the particular parameters for this matrix -# multiplication, in this case a multiplication of to square matricies of size -# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" -# -# .. note:: Improve performance with custom targets -# In order for TVM to take full advantage of specific hardware platforms, -# you will want to manuall specify your CPU capabilities. For example: -# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 -# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 - -target = tvm.target.Target("llvm") -N = L = M = 1024 -task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) - -# Inspect the computational graph -print("Computational DAG:") -print(task.compute_dag) - -################################################################################ -# Set Parameters for Auto-Scheduler -# --------------------------------- -# Next, we set parameters for the auto-scheduler. -# -# * :code:`num_measure_trials` is the number of measurement trials we can use -# during the search. We only make 10 trials in this tutorial for a fast -# demonstration. In practice, 1000 is a good value for the search to converge. -# You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to log measurement records into a -# file `matmul.json`. The measurement records can be used to query the history -# best, resume the search, and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions` for more parameters - -log_file = "matmul.json" -tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, -) - -################################################################################ -# Run the search -# -------------- -# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the -# search and let the auto-scheduler do its magic. After some measurement -# trials, we can load the best schedule from the log file and apply it. - -# Run auto-tuning (search) -task.tune(tune_option) -# Apply the best schedule -sch, args = task.apply_best(log_file) - -################################################################################ -# Inspecting the Optimized Schedule -# --------------------------------- -# We can lower the schedule to see the IR after auto-scheduling. The -# auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and -# operator fusion. 
- -print("Lowered TIR:") -print(tvm.lower(sch, args, simple_mode=True)) - -################################################################################ -# Check correctness and evaluate performance -# ------------------------------------------ -# We build the binary and check its correctness and performance. - -func = tvm.build(sch, args, target) -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = np.random.uniform(size=(N, M)).astype(np.float32) -out_np = a_np.dot(b_np) + c_np - -dev = tvm.cpu() -a_tvm = tvm.nd.array(a_np, device=dev) -b_tvm = tvm.nd.array(b_np, device=dev) -c_tvm = tvm.nd.array(c_np, device=dev) -out_tvm = tvm.nd.empty(out_np.shape, device=dev) -func(a_tvm, b_tvm, c_tvm, out_tvm) - -# Check results -np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) - -# Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) -print( - "Execution time of this operator: %.3f ms" - % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) -) - - -################################################################################ -# Using the record file -# --------------------- -# During the search, all measurement records are logged into the record file -# "matmul.json". The measurement records can be used to re-apply search -# results, resume the search, and perform other analyses. -# -# Here is an example where we load the best schedule from a file, and print the -# equivalent python schedule API. This can be used for debugging and learning -# the behavior of the auto-scheduler. - -print("Equivalent python schedule:") -print(task.print_best(log_file)) - -################################################################################ -# A more complicated example is to resume the search. In this case, we need to -# create the search policy and cost model by ourselves and resume the status of -# search policy and cost model with the log file. In the example below we -# resume the status and do more 5 trials. - - -def resume_search(task, log_file): - print("Resume search:") - cost_model = auto_scheduler.XGBModel() - cost_model.update_from_file(log_file) - search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] - ) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] - ) - task.tune(tune_option, search_policy=search_policy) - - -resume_search(task, log_file) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to use the TVM Auto-Scheduler to -# automatically optimize a matrix multiplication, without the need to specify a -# search template. It ends a series of examples that starts from the Tensor -# Expression (TE) language that demonstrates how TVM can optimize computational -# operations. diff --git a/tutorials/get_started/autotvm_matmul_x86.py b/tutorials/get_started/autotvm_matmul_x86.py deleted file mode 100644 index f9b33b894192..000000000000 --- a/tutorials/get_started/autotvm_matmul_x86.py +++ /dev/null @@ -1,377 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-autotvm-matmul-x86: - -Optimizing Operators with Schedule Templates and AutoTVM -======================================================== -**Authors**: -`Lianmin Zheng `_, -`Chris Hoge `_ - -In this tutorial, we show how the TVM Tensor Expression (TE) language -can be used to write schedule templates that can be searched by AutoTVM to -find the optimal schedule. This process is called Auto-Tuning, which helps -automate the process of optimizing tensor computation. - -This tutorial builds on the previous `tutorial on how to write a matrix -multiplication using TE `. - -There are two steps in auto-tuning. - -- The first step is defining a search space. -- The second step is running a search algorithm to explore through this space. - -In this tutorial, you can learn how to perform these two steps in TVM. The whole -workflow is illustrated by a matrix multiplication example. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. - To get it to run, you will need to wrap the body of this tutorial in a - :code:`if __name__ == "__main__":` block. -""" - -################################################################################ -# Install dependencies -# -------------------- -# To use autotvm package in TVM, we need to install some extra dependencies. -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython as FFI of -# TVM. In the root directory of TVM, execute: -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Begin by importing the required packages. - -import logging -import sys - -import numpy as np -import tvm -from tvm import te -import tvm.testing - -# the module is called `autotvm` -from tvm import autotvm - -################################################################################ -# Basic Matrix Multiplication with TE -# ----------------------------------- -# Recall the basic implementation of matrix multiplication using TE. We write -# it down here with a few changes. We will wrap the multiplication in a python -# function definition. For simplicity, we will focus our attention on a split -# optimization, using a fixed value that defines the block size of the -# reordering. 
- - -def matmul_basic(N, L, M, dtype): - - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - yo, yi = s[C].split(y, 8) - xo, xi = s[C].split(x, 8) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Matrix Multiplication with AutoTVM -# ---------------------------------- -# In the previous schedule code, we use a constant "8" as the tiling factor. -# However, it might not be the best one because the best tiling factor depends -# on real hardware environment and input shape. -# -# If you want the schedule code to be portable across a wider range of input -# shapes and target hardware, it is better to define a set of candidate values -# and pick the best one according to the measurement results on target -# hardware. -# -# In autotvm, we can define a tunable parameter, or a "knob" for such kind of -# value. - -################################################################################ -# A Basic Matrix Multiplication Template -# -------------------------------------- -# We begin with an example of how to create a tunable parameter set for the -# block size of the `split` scheduling operation. - -# Matmul V1: List candidate values -@autotvm.template("tutorial/matmul_v1") # 1. use a decorator -def matmul_v1(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - # 2. get the config object - cfg = autotvm.get_config() - - # 3. define search space - cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) - cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) - - # 4. schedule according to config - yo, yi = s[C].split(y, cfg["tile_y"].val) - xo, xi = s[C].split(x, cfg["tile_x"].val) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Here we make four modifications to the previous schedule code and get a -# tunable "template". We can explain the modifications one by one. -# -# 1. Use a decorator to mark this function as a simple template. -# 2. Get a config object: You can regard this :code:`cfg` as an argument of -# this function but we obtain it in a different way. With this argument, this -# function is no longer a deterministic schedule. Instead, we can pass -# different configurations to this function and get different schedules. A -# function that uses a configuration object like this is called a "template". -# -# To make the template function more compact, we can do two things to define -# the parameter search space within a single function. -# -# 1. Define a search space across a set values. This is done by making -# :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the -# tunable knobs in this function and build a search space from it. -# 2. Schedule according to an entity in this space. This is done by making -# :code:`cfg` a :any:`ConfigEntity` object. 
When it is a -# :any:`ConfigEntity`, it will ignore all space definition API (namely, -# :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic -# values for all tunable knobs, and we schedule according to these values. -# -# During auto-tuning, we will first call this template with a -# :any:`ConfigSpace` object to build the search space. Then we call this -# template with different :any:`ConfigEntity` in the built space to get -# different schedules. Finally we will measure the code generated by -# different schedules and pick the best one. -# -# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible -# values. The second one is :code:`tile_x` with a same list of possible values. -# These two knobs are independent, so they span a search space with size 25 = -# 5x5. -# 4. The configuration knobs are passed to the :code:`split` schedule -# operation, allowing us to schedule according to the 5x5 deterministic values -# we previously defined in :code:`cfg`. - -################################################################################ -# A Matrix Multiplication Template with the Advanced Parameter API -# ---------------------------------------------------------------- -# In the previous template, we manually listed all of the possible values for a -# knob. This is the lowest level API to define the space, and gives an explicit -# enumeration of the parameter space to search. However, we also provide -# another set of APIs that can make the definition of the search space easier -# and smarter. Where possible, we receomment you use this higher-level API -# -# In the following example, we use :any:`ConfigSpace.define_split` to define a -# split knob. It will enumerate all the possible ways to split an axis and -# construct the space. -# -# We also have :any:`ConfigSpace.define_reorder` for reorder knob and -# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, -# thread binding. When the high level API cannot meet your requirements, you -# can always fall back to using the low level API. - - -@autotvm.template("tutorial/matmul") -def matmul(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# .. note:: More Explanation on :code:`cfg.define_split` -# -# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will -# enumerate all possible combinations that can split axis y into two axes with -# factors of the length of y. For example, if the length of y is 32 and we -# want to split it into two axes using factors of 32, then there are 6 -# possible values for (length of outer axis, length of inner axis) pair, -# namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6 -# possible values of `tile_y`. 
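For a concrete check of the enumeration described in this note, the candidate values are simply the factor pairs of the axis length, so plain Python is enough to list them (no TVM required; the length of 32 matches the example above):

.. code-block:: python

    length = 32
    # every (outer, inner) pair whose product is the axis length
    pairs = [(length // inner, inner) for inner in range(1, length + 1) if length % inner == 0]
    print(pairs)  # [(32, 1), (16, 2), (8, 4), (4, 8), (2, 16), (1, 32)]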
-# -# During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. -# We stores the lengths of outer axes and inner axes in -# :code:`cfg['tile_y'].size` (a tuple with two elements). In this template, -# we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. -# Actually, this is equivalent to :code:`yo, yi = s[C].split(y, -# cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y, -# nparts=cfg['tile_y"].size[0])` -# -# The advantage of using cfg.apply API is that it makes multi-level splits -# (that is, when num_outputs >= 3) easier. - -################################################################################ -# Step 2: Use AutoTVM to Optimize the Matrix Multiplication -# --------------------------------------------------------- -# In Step 1, we wrote a matrix multiplication template that allowed us to -# paramaterize the block size used in the `split` schedule. We can now conduct -# a search over this parameter space. The next step is to pick a tuner to guide -# the exploration of this space. -# -# Auto-tuners in TVM -# ~~~~~~~~~~~~~~~~~~ -# The job for a tuner can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# propose a batch of configs -# measure this batch of configs on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of configs, the tuner can take different -# strategies. Some of the tuner strategies provided by TVM include: -# -# * :any:`tvm.autotvm.tuner.RandomTuner`: Enumerate the space in a random order -# * :any:`tvm.autotvm.tuner.GridSearchTuner`: Enumerate the space in a grid search order -# * :any:`tvm.autotvm.tuner.GATuner`: Using genetic algorithm to search through the space -# * :any:`tvm.autotvm.tuner.XGBTuner`: Uses a model based method. Train a XGBoost model to -# predict the speed of lowered IR and pick the next batch according to the -# prediction. -# -# You can choose the tuner according to the size of your space, your time -# budget and other factors. For example, if your space is very small (less -# than 1000), a gridsearch tuner or a random tuner is good enough. If your -# space is at the level of 10^9 (this is the space size of a conv2d operator on -# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. - -################################################################################ -# Begin tuning -# ~~~~~~~~~~~~ -# Here we continue our matrix multiplication example. First we create a tuning -# task. We can also inspect the initialized search space. In this case, for a -# 512x512 square matrix multiplication, the space size is 10x10=100 Note that -# the task and search space are independent of the tuner picked. - -N, L, M = 512, 512, 512 -task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") -print(task.config_space) - -################################################################################ -# Then we need to define how to measure the generated code and pick a tuner. -# Since our space is small, a random tuner is just okay. -# -# We only make 10 trials in this tutorial for demonstration. In practice, you -# can do more trials according to your time budget. We will log the tuning -# results into a log file. This file can be used to choose the best -# configuration discovered by the tuner later. 
- -# logging config (for printing tuning log to the screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -################################################################################ -# There are two steps for measuring a config: build and run. By default, we use -# all CPU cores to compile program. We then measure them sequentially. To help -# reduce variance, we take 5 measurements and average them. -measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) - -# Begin tuning with RandomTuner, log records to file `matmul.log` -# You can use alternatives like XGBTuner. -tuner = autotvm.tuner.RandomTuner(task) -tuner.tune( - n_trial=10, - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file("matmul.log")], -) - -################################################################################ -# With tuning completed, we can choose the configuration from the log file that -# has the best measured performance and compile the schedule with the -# corresponding parameters. We also do a quick verfication that the schedule is -# producing correct answers. We can call the function :code:`matmul` directly -# under the :any:`autotvm.apply_history_best` context. When we call this -# function, it will query the dispatch context with its argument and get the -# best config with the same argument. - -# apply history best from log file -with autotvm.apply_history_best("matmul.log"): - with tvm.target.Target("llvm"): - s, arg_bufs = matmul(N, L, M, "float32") - func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = a_np.dot(b_np) - -c_tvm = tvm.nd.empty(c_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to build operator templates that allow -# TVM to search a parameter space and choose optimized schedule configurations. -# To gain a deeper understanding of how this works, we recommend expanding on -# this example by adding new search parameters to the schedule based on -# schedule operations demonstated in the `Getting Started With Tensor -# Expressions _` tutorial. In the upcoming sections, we -# will demonstate the AutoScheduler, a method for TVM to optimize common -# operators without the need for the user to provide a user-defined template. diff --git a/tutorials/get_started/autotvm_relay_x86.py b/tutorials/get_started/autotvm_relay_x86.py deleted file mode 100644 index 67faec4505a6..000000000000 --- a/tutorials/get_started/autotvm_relay_x86.py +++ /dev/null @@ -1,476 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compiling and Optimizing a Model with the Python Interface (AutoTVM) -==================================================================== -**Author**: -`Chris Hoge `_ - -In the `TVMC Tutorial `_, we covered how to compile, run, and tune a -pre-trained vision model, ResNet-50-v2 using the command line interface for -TVM, TVMC. TVM is more that just a command-line tool though, it is an -optimizing framework with APIs available for a number of different languages -that gives you tremendous flexibility in working with machine learning models. - -In this tutorial we will cover the same ground we did with TVMC, but show how -it is done with the Python API. Upon completion of this section, we will have -used the Python API for TVM to accomplish the following tasks: - -* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. -* Run a real image through the compiled model, and interpret the output and model - performance. -* Tune the model that model on a CPU using TVM. -* Re-compile an optimized model using the tuning data collected by TVM. -* Run the image through the optimized model, and compare the output and model - performance. - -The goal of this section is to give you an overview of TVM's capabilites and -how to use them through the Python API. -""" - -################################################################################ -# TVM is a deep learning compiler framework, with a number of different modules -# available for working with deep learning models and operators. In this -# tutorial we will work through how to load, compile, and optimize a model -# using the Python API. -# -# We begin by importing a number of dependencies, including ``onnx`` for -# loading and converting the model, helper utilities for downloading test data, -# the Python Image Library for working with the image data, ``numpy`` for pre -# and post-processing of the image data, the TVM Relay framework, and the TVM -# Graph Executor. - -import onnx -from tvm.contrib.download import download_testdata -from PIL import Image -import numpy as np -import tvm.relay as relay -import tvm -from tvm.contrib import graph_executor - -################################################################################ -# Downloading and Loading the ONNX Model -# -------------------------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50-layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading -# `Netron `_, a freely available ML model viewer. -# -# TVM provides a helper library to download pre-trained models. By providing a -# model URL, file name, and model type through the module, TVM will download -# the model and save it to disk. For the instance of an ONNX model, you can -# then load it into memory using the ONNX runtime. -# -# .. 
note:: Working with Other Model Formats -# -# TVM supports many popular model formats. A list can be found in the `Compile -# Deep Learning Models -# `_ -# section of the TVM Documentation. - -model_url = "".join( - [ - "https://github.com/onnx/models/raw/", - "master/vision/classification/resnet/model/", - "resnet50-v2-7.onnx", - ] -) - -model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx") -onnx_model = onnx.load(model_path) - -################################################################################ -# Downloading, Preprocessing, and Loading the Test Image -# ------------------------------------------------------ -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute image for any of your choosing. -# -# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center -# -# Download the image data, then convert it to a numpy array to use as an input to the model. - -img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -img_path = download_testdata(img_url, "imagenet_cat.png", module="data") - -# Resize it to 224x224 -resized_image = Image.open(img_path).resize((224, 224)) -img_data = np.asarray(resized_image).astype("float32") - -# Our input image is in HWC layout while ONNX expects CHW input, so convert the array -img_data = np.transpose(img_data, (2, 0, 1)) - -# Normalize according to the ImageNet input specification -imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) -imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) -norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev - -# Add the batch dimension, as we are expecting 4-dimensional input: NCHW. -img_data = np.expand_dims(norm_img_data, axis=0) - -############################################################################### -# Compile the Model With Relay -# ---------------------------- -# -# The next step is to compile the ResNet model. We begin by importing the model -# to relay using the `from_onnx` importer. We then build the model, with -# standard optimizations, into a TVM library. Finally, we create a TVM graph -# runtime module from the library. - -target = "llvm" - -###################################################################### -# .. note:: Defining the Correct Target -# -# Specifying the correct target can have a huge impact on the performance of -# the compiled module, as it can take advantage of hardware features -# available on the target. For more information, please refer to `Auto-tuning -# a convolutional network for x86 CPU -# `_. -# We recommend identifying which CPU you are running, along with optional -# features, and set the target appropriately. For example, for some -# processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm -# -mcpu=skylake-avx512"`` for processors with the AVX-512 vector instruction -# set. -# - -# The input name may vary across model types. 
You can use a tool -# like netron to check input names -input_name = "data" -shape_dict = {input_name: img_data.shape} - -mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -###################################################################### -# Execute on the TVM Runtime -# -------------------------- -# Now that we've compiled the model, we can use the TVM runtime to make -# predictions with it. To use TVM to run the model and make predictions, we -# need two things: -# -# - The compiled model, which we just produced. -# - Valid input to the model to make predictions on. - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -################################################################################ -# Collect Basic Performance Data -# ------------------------------ -# We want to collect some basic performance data associated with this -# unoptimized model and compare it to a tuned model later. To help account for -# CPU noise, we run the computation in multiple batches in multiple -# repetitions, then gather some basis statistics on the mean, median, and -# standard deviation. -import timeit - -timing_number = 10 -timing_repeat = 10 -unoptimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -unoptimized = { - "mean": np.mean(unoptimized), - "median": np.median(unoptimized), - "std": np.std(unoptimized), -} - -print(unoptimized) - -################################################################################ -# Postprocess the output -# ---------------------- -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet-50-V2 into a more human-readable form, using the lookup-table provided -# for the model. - -from scipy.special import softmax - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -# Open the output and read the output tensor -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -################################################################################ -# This should produce the following output: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Tune the model -# -------------- -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVM to target your working platform. 
-# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. -# The results of these runs are stored in a tuning records file. -# -# In the simplest form, tuning requires you to provide three things: -# -# - the target specification of the device you intend to run this model on -# - the path to an output file in which the tuning records will be stored -# - a path to the model to be tuned. -# - -import tvm.auto_scheduler as auto_scheduler -from tvm.autotvm.tuner import XGBTuner -from tvm import autotvm - -# Set up some basic parameters for the runner. The runner takes compiled code -# that is generated with a specific set of parameters and measures the -# performance of it. ``number`` specifies the number of different -# configurations that we will test, while ``repeat`` specifies how many -# measurements we will take of each configuration. ``min_repeat_ms`` is a value -# that specifies how long need to run configuration test. If the number of -# repeats falls under this time, it will be increased. This option is necessary -# for accurate tuning on GPUs, and is not required for CPU tuning. Setting this -# value to 0 disables it. The ``timeout`` places an upper limit on how long to -# run training code for each tested configuration. - -number = 10 -repeat = 1 -min_repeat_ms = 0 # since we're tuning on a CPU, can be set to 0 -timeout = 10 # in seconds - -# create a TVM runner -runner = autotvm.LocalRunner( - number=number, - repeat=repeat, - timeout=timeout, - min_repeat_ms=min_repeat_ms, - enable_cpu_cache_flush=True, -) - -# Create a simple structure for holding tuning options. We use an XGBoost -# algorithim for guiding the search. For a production job, you will want to set -# the number of trials to be larger than the value of 10 used here. For CPU we -# recommend 1500, for GPU 3000-4000. The number of trials required can depend -# on the particular model and processor, so it's worth spending some time -# evaluating performance across a range of values to find the best balance -# between tuning time and model optimization. Because running tuning is time -# intensive we set number of trials to 10, but do not recommend a value this -# small. The ``early_stopping`` parameter is the minimum number of trails to -# run before a condition that stops the search early can be applied. The -# measure option indicates where trial code will be built, and where it will be -# run. In this case, we're using the ``LocalRunner`` we just created and a -# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write -# the tuning data to. - -tuning_option = { - "tuner": "xgb", - "trials": 10, - "early_stopping": 100, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func="default"), runner=runner - ), - "tuning_records": "resnet-50-v2-autotuning.json", -} - -################################################################################ -# .. 
note:: Defining the Tuning Search Algorithm -# -# By default this search is guided using an `XGBoost Grid` algorithm. -# Depending on your model complexity and amount of time available, you might -# want to choose a different algorithm. - - -################################################################################ -# .. note:: Setting Tuning Parameters -# -# In this example, in the interest of time, we set the number of trials and -# early stopping to 10. You will likely see more performance improvements if -# you set these values to be higher but this comes at the expense of time -# spent tuning. The number of trials required for convergence will vary -# depending on the specifics of the model and the target platform. - -# begin by extracting the taks from the onnx model -tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params) - -# Tune the extracted tasks sequentially. -for i, task in enumerate(tasks): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - tuner_obj = XGBTuner(task, loss_type="rank") - tuner_obj.tune( - n_trial=min(tuning_option["trials"], len(task.config_space)), - early_stopping=tuning_option["early_stopping"], - measure_option=tuning_option["measure_option"], - callbacks=[ - autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix), - autotvm.callback.log_to_file(tuning_option["tuning_records"]), - ], - ) - -################################################################################ -# The output from this tuning process will look something like this: -# -# .. code-block:: bash -# -# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done. -# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done. -# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done. -# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done. -# # [Task 4/24] Current/Best: 11.96/ 156.49 GFLOPS | Progress: (960/1000) | 632.26 s Done. -# # [Task 5/24] Current/Best: 23.75/ 130.78 GFLOPS | Progress: (800/1000) | 739.29 s Done. -# # [Task 6/24] Current/Best: 38.29/ 198.31 GFLOPS | Progress: (1000/1000) | 624.51 s Done. -# # [Task 7/24] Current/Best: 4.31/ 210.78 GFLOPS | Progress: (1000/1000) | 701.03 s Done. -# # [Task 8/24] Current/Best: 50.25/ 185.35 GFLOPS | Progress: (972/1000) | 538.55 s Done. -# # [Task 9/24] Current/Best: 50.19/ 194.42 GFLOPS | Progress: (1000/1000) | 487.30 s Done. -# # [Task 10/24] Current/Best: 12.90/ 172.60 GFLOPS | Progress: (972/1000) | 607.32 s Done. -# # [Task 11/24] Current/Best: 62.71/ 203.46 GFLOPS | Progress: (1000/1000) | 581.92 s Done. -# # [Task 12/24] Current/Best: 36.79/ 224.71 GFLOPS | Progress: (1000/1000) | 675.13 s Done. -# # [Task 13/24] Current/Best: 7.76/ 219.72 GFLOPS | Progress: (1000/1000) | 519.06 s Done. -# # [Task 14/24] Current/Best: 12.26/ 202.42 GFLOPS | Progress: (1000/1000) | 514.30 s Done. -# # [Task 15/24] Current/Best: 31.59/ 197.61 GFLOPS | Progress: (1000/1000) | 558.54 s Done. -# # [Task 16/24] Current/Best: 31.63/ 206.08 GFLOPS | Progress: (1000/1000) | 708.36 s Done. -# # [Task 17/24] Current/Best: 41.18/ 204.45 GFLOPS | Progress: (1000/1000) | 736.08 s Done. -# # [Task 18/24] Current/Best: 15.85/ 222.38 GFLOPS | Progress: (980/1000) | 516.73 s Done. -# # [Task 19/24] Current/Best: 15.78/ 203.41 GFLOPS | Progress: (1000/1000) | 587.13 s Done. -# # [Task 20/24] Current/Best: 30.47/ 205.92 GFLOPS | Progress: (980/1000) | 471.00 s Done. 
-# # [Task 21/24] Current/Best: 46.91/ 227.99 GFLOPS | Progress: (308/1000) | 219.18 s Done. -# # [Task 22/24] Current/Best: 13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done. -# # [Task 23/24] Current/Best: 53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done. -# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done. - -################################################################################ -# Compiling an Optimized Model with Tuning Data -# ---------------------------------------------- -# -# As an output of the tuning process above, we obtained the tuning records -# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to -# generate high performance code for the model on your specified target. -# -# Now that tuning data for the model has been collected, we can re-compile the -# model using optimized operators to speed up our computations. - -with autotvm.apply_history_best(tuning_option["tuning_records"]): - with tvm.transform.PassContext(opt_level=3, config={}): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -################################################################################ -# Verify that the optimized model runs and produces the same results: - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -# Verifying that the predictions are the same: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610550 -# # class='n02123159 tiger cat' with probability=0.367181 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Comparing the Tuned and Untuned Models -# -------------------------------------- -# We want to collect some basic performance data associated with this optimized -# model to compare it to the unoptimized model. Depending on your underlying -# hardware, number of iterations, and other factors, you should see a performance -# improvement in comparing the optimized model to the unoptimized model. - -import timeit - -timing_number = 10 -timing_repeat = 10 -optimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)} - - -print("optimized: %s" % (optimized)) -print("unoptimized: %s" % (unoptimized)) - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we gave a short example of how to use the TVM Python API -# to compile, run, and tune a model. We also discussed the need for pre and -# post-processing of inputs and outputs. After the tuning process, we -# demonstrated how to compare the performance of the unoptimized and optimize -# models. -# -# Here we presented a simple example using ResNet 50 V2 locally. 
However, TVM -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py deleted file mode 100644 index 25208369f74d..000000000000 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ /dev/null @@ -1,265 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-cross-compilation-and-rpc: - -Cross Compilation and RPC -========================= -**Author**: `Ziheng Jiang `_, `Lianmin Zheng `_ - -This tutorial introduces cross compilation and remote device -execution with RPC in TVM. - -With cross compilation and RPC, you can **compile a program on your -local machine then run it on the remote device**. It is useful when -the remote device resource are limited, like Raspberry Pi and mobile -platforms. In this tutorial, we will use the Raspberry Pi for a CPU example -and the Firefly-RK3399 for an OpenCL example. -""" - -###################################################################### -# Build TVM Runtime on Device -# --------------------------- -# -# The first step is to build the TVM runtime on the remote device. -# -# .. note:: -# -# All instructions in both this section and the next section should be -# executed on the target device, e.g. Raspberry Pi. We assume the target -# is running Linux. -# -# Since we do compilation on the local machine, the remote device is only used -# for running the generated code. We only need to build the TVM runtime on -# the remote device. -# -# .. code-block:: bash -# -# git clone --recursive https://github.com/apache/tvm tvm -# cd tvm -# make runtime -j2 -# -# After building the runtime successfully, we need to set environment variables -# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` -# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM -# directory is in :code:`~/tvm`): -# -# .. code-block:: bash -# -# export PYTHONPATH=$PYTHONPATH:~/tvm/python -# -# To update the environment variables, execute :code:`source ~/.bashrc`. - -###################################################################### -# Set Up RPC Server on Device -# --------------------------- -# To start an RPC server, run the following command on your remote device -# (Which is Raspberry Pi in this example). -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 -# -# If you see the line below, it means the RPC server started -# successfully on your device. -# -# .. 
code-block:: bash -# -# INFO:root:RPCServer: bind to 0.0.0.0:9090 -# - -###################################################################### -# Declare and Cross Compile Kernel on Local Machine -# ------------------------------------------------- -# -# .. note:: -# -# Now we go back to the local machine, which has a full TVM installed -# (with LLVM). -# -# Here we will declare a simple kernel on the local machine: - -import numpy as np - -import tvm -from tvm import te -from tvm import rpc -from tvm.contrib import utils - -n = tvm.runtime.convert(1024) -A = te.placeholder((n,), name="A") -B = te.compute((n,), lambda i: A[i] + 1.0, name="B") -s = te.create_schedule(B.op) - -###################################################################### -# Then we cross compile the kernel. -# The target should be 'llvm -mtriple=armv7l-linux-gnueabihf' for -# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable -# on our webpage building server. See the detailed note in the following block. - -local_demo = True - -if local_demo: - target = "llvm" -else: - target = "llvm -mtriple=armv7l-linux-gnueabihf" - -func = tvm.build(s, [A, B], target=target, name="add_one") -# save the lib at a local temp folder -temp = utils.tempdir() -path = temp.relpath("lib.tar") -func.export_library(path) - -###################################################################### -# .. note:: -# -# To run this tutorial with a real remote device, change :code:`local_demo` -# to False and replace :code:`target` in :code:`build` with the appropriate -# target triple for your device. The target triple which might be -# different for different devices. For example, it is -# :code:`'llvm -mtriple=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and -# :code:`'llvm -mtriple=aarch64-linux-gnu'` for RK3399. -# -# Usually, you can query the target by running :code:`gcc -v` on your -# device, and looking for the line starting with :code:`Target:` -# (Though it may still be a loose configuration.) -# -# Besides :code:`-mtriple`, you can also set other compilation options -# like: -# -# * -mcpu= -# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture. -# * -mattr=a1,+a2,-a3,... -# Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU. -# To get the list of available attributes, you can do: -# -# .. code-block:: bash -# -# llc -mtriple= -mattr=help -# -# These options are consistent with `llc `_. -# It is recommended to set target triple and feature set to contain specific -# feature available, so we can take full advantage of the features of the -# board. -# You can find more details about cross compilation attributes from -# `LLVM guide of cross compilation `_. - -###################################################################### -# Run CPU Kernel Remotely by RPC -# ------------------------------ -# We show how to run the generated CPU kernel on the remote device. -# First we obtain an RPC session from remote device. - -if local_demo: - remote = rpc.LocalSession() -else: - # The following is my environment, change this to the IP address of your target device - host = "10.77.1.162" - port = 9090 - remote = rpc.connect(host, port) - -###################################################################### -# Upload the lib to the remote device, then invoke a device local -# compiler to relink them. 
Now `func` is a remote module object. - -remote.upload(path) -func = remote.load_module("lib.tar") - -# create arrays on the remote device -dev = remote.cpu() -a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) -b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) -# the function will run on the remote device -func(a, b) -np.testing.assert_equal(b.numpy(), a.numpy() + 1) - -###################################################################### -# When you want to evaluate the performance of the kernel on the remote -# device, it is important to avoid the overhead of network. -# :code:`time_evaluator` will returns a remote function that runs the -# function over number times, measures the cost per run on the remote -# device and returns the measured cost. Network overhead is excluded. - -time_f = func.time_evaluator(func.entry_name, dev, number=10) -cost = time_f(a, b).mean -print("%g secs/op" % cost) - -######################################################################### -# Run OpenCL Kernel Remotely by RPC -# --------------------------------- -# For remote OpenCL devices, the workflow is almost the same as above. -# You can define the kernel, upload files, and run via RPC. -# -# .. note:: -# -# Raspberry Pi does not support OpenCL, the following code is tested on -# Firefly-RK3399. You may follow this `tutorial `_ -# to setup the OS and OpenCL driver for RK3399. -# -# Also we need to build the runtime with OpenCL enabled on rk3399 board. In the TVM -# root directory, execute -# -# .. code-block:: bash -# -# cp cmake/config.cmake . -# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake -# make runtime -j4 -# -# The following function shows how we run an OpenCL kernel remotely - - -def run_opencl(): - # NOTE: This is the setting for my rk3399 board. You need to modify - # them according to your environment. - opencl_device_host = "10.77.1.145" - opencl_device_port = 9090 - target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") - - # create schedule for the above "add one" compute declaration - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], target=target) - - remote = rpc.connect(opencl_device_host, opencl_device_port) - - # export and upload - path = temp.relpath("lib_cl.tar") - func.export_library(path) - remote.upload(path) - func = remote.load_module("lib_cl.tar") - - # run - dev = remote.cl() - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - func(a, b) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) - print("OpenCL test passed!") - - -###################################################################### -# Summary -# ------- -# This tutorial provides a walk through of cross compilation and RPC -# features in TVM. -# -# - Set up an RPC server on the remote device. -# - Set up the target device configuration to cross compile the kernels on the -# local machine. -# - Upload and run the kernels remotely via the RPC API. diff --git a/tutorials/get_started/install.py b/tutorials/get_started/install.py deleted file mode 100644 index b69b8b493a4f..000000000000 --- a/tutorials/get_started/install.py +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Installing TVM -============== -**Authors**: -`Jocelyn Shiue `_, -`Chris Hoge `_ - -Depending on your needs and your working environment, there are a few different -methods for installing TVM. These include: - -* Installing from source -* Installing from third-party binary package. -""" - -################################################################################ -# Installing from Source -# ---------------------- -# Installing from source is the recommended method for installing TVM. It will -# allow you to enable specific features such as GPU support, microcontroller -# support (microTVM), and a debugging runtime, and other features. You will also -# want to install from source if you want to actively contribute to the TVM -# project. The full instructions are on the `Install TVM From Source -# `_ page. - -################################################################################ -# Installing From Binary Packages -# -------------------------------- -# You may install convenient third-party binary package distributions to -# quickly try things out. TLCPack is a thirdparty volunteer community that -# builds binary packages from TVM source. It offers support matrix with -# instructions to install on different platforms, with different features. -# Checkout `TLCPack `_ to learn more. Note that the -# thirdparty binary packages could contain additional licensing terms for -# the hardware drivers that are bundled with it. diff --git a/tutorials/get_started/introduction.py b/tutorials/get_started/introduction.py deleted file mode 100644 index 0746c3983b61..000000000000 --- a/tutorials/get_started/introduction.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Introduction -============ -**Authors**: -`Jocelyn Shiue `_, -`Chris Hoge `_, -`Lianmin Zheng `_ - -Apache TVM is an open source machine learning compiler framework for CPUs, -GPUs, and machine learning accelerators. It aims to enable machine learning -engineers to optimize and run computations efficiently on any hardware backend. 
-The purpose of this tutorial is to take a guided tour through all of the major -features of TVM by defining and demonstrating key concepts. A new user should -be able to work through the tutorial from start to finish and be able to -operate TVM for automatic model optimization, while having a basic -understanding of the TVM architecture and how it works. - -Contents --------- - -#. :doc:`Introduction ` -#. :doc:`Installing TVM ` -#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` -#. :doc:`Compiling and Optimizing a Model with the Python Interface ` -#. :doc:`Working with Operators Using Tensor Expression ` -#. :doc:`Optimizing Operators with Templates and AutoTVM ` -#. :doc:`Optimizing Operators with Template-free AutoScheduler ` -#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) ` -#. :doc:`Compiling Deep Learning Models for GPUs ` -""" - -################################################################################ -# An Overview of TVM and Model Optimization -# ========================================= -# -# The diagram below illustrates the steps a machine model takes as it is -# transformed with the TVM optimizing compiler framework. -# -# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png -# :width: 100% -# :alt: A High Level View of TVM -# -# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*. -# The importer layer is where TVM can ingest models from other frameworks, like -# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each -# frontend varies as we are constantly improving the open source project. If -# you're having issues importing your model into TVM, you may want to try -# converting it to ONNX. -# -# 2. Translate to *Relay*, TVM's high-level model language. -# A model that has been imported into TVM is represented in Relay. Relay is a -# functional language and intermediate representation (IR) for neural networks. -# It has support for: -# -# - Traditional data flow-style representations -# - Functional-style scoping, let-binding which makes it a fully featured -# differentiable language -# - Ability to allow the user to mix the two programming styles -# -# Relay applies graph-level optimization passes to optimize the model. -# -# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a -# higher-level representation is transformed into a lower-level -# representation. After applying the high-level optimizations, Relay -# runs FuseOps pass to partition the model into many small subgraphs and lowers -# the subgraphs to TE representation. Tensor Expression (TE) is a -# domain-specific language for describing tensor computations. -# TE also provides several *schedule* primitives to specify low-level loop -# optimizations, such as tiling, vectorization, parallelization, -# unrolling, and fusion. -# To aid in the process of converting Relay representation into TE representation, -# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined -# templates of common tensor operators (e.g., conv2d, transpose). -# -# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. -# A schedule specifies the low-level loop optimizations for an operator or -# subgraph defined in TE. Auto-tuning modules search for the best schedule -# and compare them with cost models and on-device measurements. -# There are two auto-tuning modules in TVM. -# -# - **AutoTVM**: A template-based auto-tuning module. 
It runs search algorithms -# to find the best values for the tunable knobs in a user-defined template. -# For common operators, their templates are already provided in TOPI. -# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. -# It does not require pre-defined schedule templates. Instead, it generates -# the search space automatically by analyzing the computation definition. -# It then searches for the best schedule in the generated search space. -# -# 5. Choose the optimal configurations for model compilation. After tuning, the -# auto-tuning module generates tuning records in JSON format. This step -# picks the best schedule for each subgraph. -# -# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level -# intermediate representation. After selecting the optimal configurations -# based on the tuning step, each TE subgraph is lowered to TIR and be -# optimized by low-level optimization passes. Next, the optimized TIR is -# lowered to the target compiler of the hardware platform. -# This is the final code generation phase to produce an optimized model -# that can be deployed into production. TVM supports several different -# compiler backends including: -# -# - LLVM, which can target arbitrary microprocessor architecture including -# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any -# other platform supported by LLVM. -# - Specialized compilers, such as NVCC, NVIDIA's compiler. -# - Embedded and specialized targets, which are implemented through TVM's -# Bring Your Own Codegen (BYOC) framework. -# -# 7. Compile down to machine code. At the end of this process, the -# compiler-specific generated code can be lowered to machine code. -# -# TVM can compile models down to a linkable object module, which can then be -# run with a lightweight TVM runtime that provides C APIs to dynamically -# load the model, and entry points for other languages such as Python and -# Rust. TVM can also build a bundled deployment in which the runtime is -# combined with the model in a single package. -# -# The remainder of the tutorial will cover these aspects of TVM in more detail. diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py deleted file mode 100644 index fd7f5aa9d756..000000000000 --- a/tutorials/get_started/relay_quick_start.py +++ /dev/null @@ -1,155 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-relay-quick-start: - -Quick Start Tutorial for Compiling Deep Learning Models -======================================================= -**Author**: `Yao Wang `_, `Truman Tian `_ - -This example shows how to build a neural network with Relay python frontend and -generates a runtime library for Nvidia GPU with TVM. 
-Notice that you need to build TVM with cuda and llvm enabled. -""" - -###################################################################### -# Overview for Supported Hardware Backend of TVM -# ---------------------------------------------- -# The image below shows hardware backend currently supported by TVM: -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png -# :align: center -# -# In this tutorial, we'll choose cuda and llvm as target backends. -# To begin with, let's import Relay and TVM. - -import numpy as np - -from tvm import relay -from tvm.relay import testing -import tvm -from tvm import te -from tvm.contrib import graph_executor -import tvm.testing - -###################################################################### -# Define Neural Network in Relay -# ------------------------------ -# First, let's define a neural network with relay python frontend. -# For simplicity, we'll use pre-defined resnet-18 network in Relay. -# Parameters are initialized with Xavier initializer. -# Relay also supports other model formats such as MXNet, CoreML, ONNX and -# Tensorflow. -# -# In this tutorial, we assume we will do inference on our device and -# the batch size is set to be 1. Input images are RGB color images of -# size 224 * 224. We can call the -# :py:meth:`tvm.relay.expr.TupleWrapper.astext()` to show the network -# structure. - -batch_size = 1 -num_class = 1000 -image_shape = (3, 224, 224) -data_shape = (batch_size,) + image_shape -out_shape = (batch_size, num_class) - -mod, params = relay.testing.resnet.get_workload( - num_layers=18, batch_size=batch_size, image_shape=image_shape -) - -# set show_meta_data=True if you want to show meta data -print(mod.astext(show_meta_data=False)) - -###################################################################### -# Compilation -# ----------- -# Next step is to compile the model using the Relay/TVM pipeline. -# Users can specify the optimization level of the compilation. -# Currently this value can be 0 to 3. The optimization passes include -# operator fusion, pre-computation, layout transformation and so on. -# -# :py:func:`relay.build` returns three components: the execution graph in -# json format, the TVM module library of compiled functions specifically -# for this graph on the target hardware, and the parameter blobs of -# the model. During the compilation, Relay does the graph-level -# optimization while TVM does the tensor-level optimization, resulting -# in an optimized runtime module for model serving. -# -# We'll first compile for Nvidia GPU. Behind the scene, :py:func:`relay.build` -# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., -# then registers the operators (i.e. the nodes of the optimized graphs) to -# TVM implementations to generate a `tvm.module`. -# To generate the module library, TVM will first transfer the high level IR -# into the lower intrinsic IR of the specified target backend, which is CUDA -# in this example. Then the machine code will be generated as the module library. - -opt_level = 3 -target = tvm.target.cuda() -with tvm.transform.PassContext(opt_level=opt_level): - lib = relay.build(mod, target, params=params) - -##################################################################### -# Run the generate library -# ------------------------ -# Now we can create graph executor and run the module on Nvidia GPU. 
- -# create random input -dev = tvm.cuda() -data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -# create module -module = graph_executor.GraphModule(lib["default"](dev)) -# set input and parameters -module.set_input("data", data) -# run -module.run() -# get output -out = module.get_output(0, tvm.nd.empty(out_shape)).numpy() - -# Print first 10 elements of output -print(out.flatten()[0:10]) - -###################################################################### -# Save and Load Compiled Module -# ----------------------------- -# We can also save the graph, lib and parameters into files and load them -# back in deploy environment. - -#################################################### - -# save the graph, lib and params into separate files -from tvm.contrib import utils - -temp = utils.tempdir() -path_lib = temp.relpath("deploy_lib.tar") -lib.export_library(path_lib) -print(temp.listdir()) - -#################################################### - -# load the module back. -loaded_lib = tvm.runtime.load_module(path_lib) -input_data = tvm.nd.array(data) - -module = graph_executor.GraphModule(loaded_lib["default"](dev)) -module.run(data=input_data) -out_deploy = module.get_output(0).numpy() - -# Print first 10 elements of output -print(out_deploy.flatten()[0:10]) - -# check whether the output from deployed module is consistent with original one -tvm.testing.assert_allclose(out_deploy, out, atol=1e-5) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py deleted file mode 100644 index 310d6bdbfee4..000000000000 --- a/tutorials/get_started/tensor_expr_get_started.py +++ /dev/null @@ -1,903 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-tensor-expr-get-started: - -Working with Operators Using Tensor Expression -============================================== -**Author**: `Tianqi Chen `_ - -In this tutorial we will turn our attention to how TVM works with Tensor -Expression (TE) to define tensor computations and apply loop optimizations. TE -describes tensor computations in a pure functional language (that is each -expression has no side effects). When viewed in context of the TVM as a whole, -Relay describes a computation as a set of operators, and each of these -operators can be represented as a TE expression where each TE expression takes -input tensors and produces an output tensor. - -This is an introductory tutorial to the Tensor Expression language in TVM. TVM -uses a domain specific tensor expression for efficient kernel construction. We -will demonstrate the basic workflow with two examples of using the tensor expression -language. The first example introduces TE and scheduling with vector -addition. 
The second expands on these concepts with a step-by-step optimization -of a matrix multiplication with TE. This matrix multiplication example will -serve as the comparative basis for future tutorials covering more advanced -features of TVM. -""" - -################################################################################ -# Example 1: Writing and Scheduling Vector Addition in TE for CPU -# --------------------------------------------------------------- -# -# Let's look at an example in Python in which we will implement a TE for -# vector addition, followed by a schedule targeted towards a CPU. -# We begin by initializing a TVM environment. - -import tvm -import tvm.testing -from tvm import te -import numpy as np - -# You will get better performance if you can identify the CPU you are targeting -# and specify it. If you're using llvm, you can get this information from the -# command ``llc --version`` to get the CPU type, and you can check -# ``/proc/cpuinfo`` for additional extensions that your processor might -# support. For example, you can use "llvm -mcpu=skylake-avx512" for CPUs with -# AVX-512 instructions. - -tgt = tvm.target.Target(target="llvm", host="llvm") - -################################################################################ -# Describing the Vector Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We describe a vector addition computation. TVM adopts tensor semantics, with -# each intermediate result represented as a multi-dimensional array. The user -# needs to describe the computation rule that generates the tensors. We first -# define a symbolic variable ``n`` to represent the shape. We then define two -# placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then -# describe the result tensor ``C``, with a ``compute`` operation. The -# ``compute`` defines a computation, with the output conforming to the -# specified tensor shape and the computation to be performed at each position -# in the tensor defined by the lambda function. Note that while ``n`` is a -# variable, it defines a consistent shape between the ``A``, ``B`` and ``C`` -# tensors. Remember, no actual computation happens during this phase, as we -# are only declaring how the computation should be done. - -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -################################################################################ -# .. note:: Lambda Functions -# -# The second argument to the ``te.compute`` method is the function that -# performs the computation. In this example, we're using an anonymous function, -# also known as a ``lambda`` function, to define the computation, in this case -# addition on the ``i``th element of ``A`` and ``B``. - -################################################################################ -# Create a Default Schedule for the Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# While the above lines describe the computation rule, we can compute ``C`` in -# many different ways to fit different devices. For a tensor with multiple -# axes, you can choose which axis to iterate over first, or computations can be -# split across different threads. TVM requires that the user to provide a -# schedule, which is a description of how the computation should be performed. -# Scheduling operations within TE can change loop orders, split computations -# across different threads, group blocks of data together, amongst other -# operations. 
An important concept behind schedules is that they only describe -# how the computation is performed, so different schedules for the same TE will -# produce the same result. -# -# TVM allows you to create a naive schedule that will compute ``C`` in by -# iterating in row major order. -# -# .. code-block:: c -# -# for (int i = 0; i < n; ++i) { -# C[i] = A[i] + B[i]; -# } - -s = te.create_schedule(C.op) - -###################################################################### -# Compile and Evaluate the Default Schedule -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# With the TE expression and a schedule, we can produce runnable code for our -# target language and architecture, in this case LLVM and a CPU. We provide -# TVM with the schedule, a list of the TE expressions that are in the schedule, -# the target and host, and the name of the function we are producing. The result -# of the output is a type-erased function that can be called directly from Python. -# -# In the following line, we use tvm.build to create a function. The build -# function takes the schedule, the desired signature of the function (including -# the inputs and outputs) as well as target language we want to compile to. - -fadd = tvm.build(s, [A, B, C], tgt, name="myadd") - -################################################################################ -# Let's run the function, and compare the output to the same computation in -# numpy. The compiled TVM function is exposes a concise C API that can be invoked -# from any language. We begin by creating a device, which is a device (CPU in this -# example) that TVM can compile the schedule to. In this case the device is an -# LLVM CPU target. We can then initialize the tensors in our device and -# perform the custom addition operation. To verify that the computation is -# correct, we can compare the result of the output of the c tensor to the same -# computation performed by numpy. - -dev = tvm.device(tgt.kind.name, 0) - -n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) -fadd(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# To get a comparison of how fast this version is compared to numpy, create a -# helper function to run a profile of the TVM generated code. 
-import timeit - -np_repeat = 100 -np_running_time = timeit.timeit( - setup="import numpy\n" - "n = 32768\n" - 'dtype = "float32"\n' - "a = numpy.random.rand(n, 1).astype(dtype)\n" - "b = numpy.random.rand(n, 1).astype(dtype)\n", - stmt="answer = a + b", - number=np_repeat, -) -print("Numpy running time: %f" % (np_running_time / np_repeat)) - - -def evaluate_addition(func, target, optimization, log): - dev = tvm.device(target.kind.name, 0) - n = 32768 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - - evaluator = func.time_evaluator(func.entry_name, dev, number=10) - mean_time = evaluator(a, b, c).mean - print("%s: %f" % (optimization, mean_time)) - - log.append((optimization, mean_time)) - - -log = [("numpy", np_running_time / np_repeat)] -evaluate_addition(fadd, tgt, "naive", log=log) - -################################################################################ -# Updating the Schedule to Use Paralleism -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now that we've illustrated the fundamentals of TE, let's go deeper into what -# schedules do, and how they can be used to optimize tensor expressions for -# different architectures. A schedule is a series of steps that are applied to -# an expression to transform it in a number of different ways. When a schedule -# is applied to an expression in TE, the inputs and outputs remain the same, -# but when compiled the implementation of the expression can change. This -# tensor addition, in the default schedule, is run serially but is easy to -# parallelize across all of the processor threads. We can apply the parallel -# schedule operation to our computation. - -s[C].parallel(C.op.axis[0]) - -################################################################################ -# The ``tvm.lower`` command will generate the Intermediate Representation (IR) -# of the TE, with the corresponding schedule. By lowering the expression as we -# apply different schedule operations, we can see the effect of scheduling on -# the ordering of the computation. We use the flag ``simple_mode=True`` to -# return a readable C-style statement. - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# It's now possible for TVM to run these blocks on independent threads. Let's -# compile and run this new schedule with the parallel operation applied: - -fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") -fadd_parallel(a, b, c) - -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -evaluate_addition(fadd_parallel, tgt, "parallel", log=log) - -################################################################################ -# Updating the Schedule to Use Vectorization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Modern CPUs also have the ability to perform SIMD operations on floating -# point values, and we can apply another schedule to our computation expression -# to take advantage of this. Accomplishing this requires multiple steps: first -# we have to split the schedule into inner and outer loops using the split -# scheduling primitive. The inner loops can use vectorization to use SIMD -# instructions using the vectorize scheduling primitive, then the outer loops -# can be parallelized using the parallel scheduling primitive. Choose the split -# factor to be the number of threads on your CPU. 
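################################################################################
# As a small optional aside (not part of the original schedule code that
# follows), the Python standard library can report how many CPU cores the
# current machine has, which is a reasonable starting point when picking the
# split factor. The code below still sets its own explicit value.

import os

print("Detected CPU cores:", os.cpu_count())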
- -# Recreate the schedule, since we modified it with the parallel operation in -# the previous example -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -s = te.create_schedule(C.op) - -# This factor should be chosen to match the number of threads appropriate for -# your CPU. This will vary depending on architecture, but a good rule is -# setting this factor to equal the number of available CPU cores. -factor = 4 - -outer, inner = s[C].split(C.op.axis[0], factor=factor) -s[C].parallel(outer) -s[C].vectorize(inner) - -fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") - -evaluate_addition(fadd_vector, tgt, "vector", log=log) - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Comparing the Diferent Schedules -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We can now compare the different schedules - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - - -################################################################################ -# .. note:: Code Specialization -# -# As you may have noticed, the declarations of ``A``, ``B`` and ``C`` all -# take the same shape argument, ``n``. TVM will take advantage of this to -# pass only a single shape argument to the kernel, as you will find in the -# printed device code. This is one form of specialization. -# -# On the host side, TVM will automatically generate check code that checks -# the constraints in the parameters. So if you pass arrays with different -# shapes into fadd, an error will be raised. -# -# We can do more specializations. For example, we can write :code:`n = -# tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, in the -# computation declaration. The generated function will only take vectors with -# length 1024. - -################################################################################ -# We've defined, scheduled, and compiled a vector addition operator, which we -# were then able to execute on the TVM runtime. We can save the operator as a -# library, which we can then load later using the TVM runtime. - -################################################################################ -# Targeting Vector Addition for GPUs (Optional) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TVM is capable of targeting multiple architectures. In the next example, we -# will target compilation of the vector addition to GPUs. - -# If you want to run this code, change ``run_cuda = True`` -# Note that by default this example is not run in the docs CI. - -run_cuda = False -if run_cuda: - # Change this target to the correct backend for you gpu. For example: cuda (NVIDIA GPUs), - # rocm (Radeon GPUS), OpenCL (opencl). - tgt_gpu = tvm.target.Target(target="cuda", host="llvm") - - # Recreate the schedule - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - print(type(C)) - - s = te.create_schedule(C.op) - - bx, tx = s[C].split(C.op.axis[0], factor=64) - - ################################################################################ - # Finally we must bind the iteration axis bx and tx to threads in the GPU - # compute grid. 
The naive schedule is not valid for GPUs, and these are - # specific constructs that allow us to generate code that runs on a GPU. - - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - - ###################################################################### - # Compilation - # ----------- - # After we have finished specifying the schedule, we can compile it - # into a TVM function. By default TVM compiles into a type-erased - # function that can be directly called from the python side. - # - # In the following line, we use tvm.build to create a function. - # The build function takes the schedule, the desired signature of the - # function (including the inputs and outputs) as well as target language - # we want to compile to. - # - # The result of compilation fadd is a GPU device function (if GPU is - # involved) as well as a host wrapper that calls into the GPU - # function. fadd is the generated host wrapper function, it contains - # a reference to the generated device function internally. - - fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd") - - ################################################################################ - # The compiled TVM function is exposes a concise C API that can be invoked from - # any language. - # - # We provide a minimal array API in python to aid quick testing and prototyping. - # The array API is based on the `DLPack `_ standard. - # - # - We first create a GPU device. - # - Then tvm.nd.array copies the data to the GPU. - # - ``fadd`` runs the actual computation - # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness). - # - # Note that copying the data to and from the memory on the GPU is a required step. - - dev = tvm.device(tgt_gpu.kind.name, 0) - - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - ################################################################################ - # Inspect the Generated GPU Code - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # You can inspect the generated code in TVM. The result of tvm.build is a TVM - # Module. fadd is the host module that contains the host wrapper, it also - # contains a device module for the CUDA (GPU) function. - # - # The following code fetches the device module and prints the content code. - - if ( - tgt_gpu.kind.name == "cuda" - or tgt_gpu.kind.name == "rocm" - or tgt_gpu.kind.name.startswith("opencl") - ): - dev_module = fadd.imported_modules[0] - print("-----GPU code-----") - print(dev_module.get_source()) - else: - print(fadd.get_source()) - -################################################################################ -# Saving and Loading Compiled Modules -# ----------------------------------- -# Besides runtime compilation, we can save the compiled modules into a file and -# load them back later. -# -# The following code first performs the following steps: -# -# - It saves the compiled host module into an object file. -# - Then it saves the device module into a ptx file. 
-# - cc.create_shared calls a compiler (gcc) to create a shared library - -from tvm.contrib import cc -from tvm.contrib import utils - -temp = utils.tempdir() -fadd.save(temp.relpath("myadd.o")) -if tgt.kind.name == "cuda": - fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) -if tgt.kind.name == "rocm": - fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) -if tgt.kind.name.startswith("opencl"): - fadd.imported_modules[0].save(temp.relpath("myadd.cl")) -cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) -print(temp.listdir()) - -################################################################################ -# .. note:: Module Storage Format -# -# The CPU (host) module is directly saved as a shared library (.so). There -# can be multiple customized formats of the device code. In our example, the -# device code is stored in ptx, as well as a meta data json file. They can be -# loaded and linked separately via import. - -################################################################################ -# Load Compiled Module -# ~~~~~~~~~~~~~~~~~~~~ -# We can load the compiled module from the file system and run the code. The -# following code loads the host and device module separately and links them -# together. We can verify that the newly loaded function works. - -fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) -if tgt.kind.name == "cuda": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name == "rocm": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name.startswith("opencl"): - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) - fadd1.import_module(fadd1_dev) - -fadd1(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# Pack Everything into One Library -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# In the above example, we store the device and host code separately. TVM also -# supports export everything as one shared library. Under the hood, we pack -# the device modules into binary blobs and link them together with the host -# code. Currently we support packing of Metal, OpenCL and CUDA modules. - -fadd.export_library(temp.relpath("myadd_pack.so")) -fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) -fadd2(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# .. note:: Runtime API and Thread-Safety -# -# The compiled modules of TVM do not depend on the TVM compiler. Instead, -# they only depend on a minimum runtime library. The TVM runtime library -# wraps the device drivers and provides thread-safe and device agnostic calls -# into the compiled functions. -# -# This means that you can call the compiled TVM functions from any thread, on -# any GPUs, provided that you have compiled the code for that GPU. - -################################################################################ -# Generate OpenCL Code -# -------------------- -# TVM provides code generation features into multiple backends. We can also -# generate OpenCL code or LLVM code that runs on CPU backends. -# -# The following code blocks generate OpenCL code, creates array on an OpenCL -# device, and verifies the correctness of the code. 
-
-if tgt.kind.name.startswith("opencl"):
-    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
-    print("------opencl code------")
-    print(fadd_cl.imported_modules[0].get_source())
-    dev = tvm.cl(0)
-    n = 1024
-    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-    fadd_cl(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-################################################################################
-# .. note:: TE Scheduling Primitives
-#
-#   TVM includes a number of different scheduling primitives:
-#
-#   - split: splits a specified axis into two axes by the defined factor.
-#   - tile: splits a computation across two axes by the defined factors.
-#   - fuse: fuses two consecutive axes of one computation.
-#   - reorder: can reorder the axes of a computation into a defined order.
-#   - bind: can bind a computation to a specific thread, useful in GPU programming.
-#   - compute_at: by default, TVM will compute tensors at the outermost level
-#     of the function, or the root. compute_at specifies that one tensor should
-#     be computed at the first axis of computation for another operator.
-#   - compute_inline: when marked inline, a computation will be expanded then
-#     inserted into the address where the tensor is required.
-#   - compute_root: moves a computation to the outermost layer, or root, of the
-#     function. This means that the stage of the computation will be fully
-#     computed before it moves on to the next stage.
-#
-#   A complete description of these primitives can be found in the
-#   `Schedule Primitives <https://tvm.apache.org/docs/tutorials/language/schedule_primitives.html>`_ docs page.
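-#
-# As a quick, self-contained illustration of two of these primitives, the
-# snippet below tiles a simple element-wise computation into 4x4 blocks and
-# then fuses the two outer block loops back into a single loop. The ``_demo``
-# names are used only for this sketch and do not appear in the examples that
-# follow.
-
-M_demo = te.var("m")
-N_demo = te.var("n")
-A_demo = te.placeholder((M_demo, N_demo), name="A_demo")
-B_demo = te.compute((M_demo, N_demo), lambda i, j: A_demo[i, j] * 2, name="B_demo")
-
-s_demo = te.create_schedule(B_demo.op)
-# tile splits the row and column axes into outer (block) and inner loops
-io, jo, ii, ji = s_demo[B_demo].tile(B_demo.op.axis[0], B_demo.op.axis[1], 4, 4)
-# fuse merges the two outer block loops into one, which could then be parallelized
-fused = s_demo[B_demo].fuse(io, jo)
-print(tvm.lower(s_demo, [A_demo, B_demo], simple_mode=True))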
-
-################################################################################
-# Example 2: Manually Optimizing Matrix Multiplication with TE
-# ------------------------------------------------------------
-#
-# Now we will consider a second, more advanced example, demonstrating how with
-# just 18 lines of Python code TVM speeds up a common matrix multiplication operation by 18x.
-#
-# **Matrix multiplication is a compute intensive operation. There are
-# two important optimizations for good CPU performance:**
-#
-# 1. Increase the cache hit rate of memory access. Both complex
-#    numerical computation and hot-spot memory access can be
-#    accelerated by a high cache hit rate. This requires us to
-#    transform the original memory access pattern to a pattern that fits
-#    the cache policy.
-#
-# 2. SIMD (single instruction, multiple data), also known as the vector
-#    processing unit. On each cycle, instead of processing a single
-#    value, SIMD can process a small batch of data. This requires us
-#    to transform the data access pattern in the loop body into a uniform
-#    pattern so that the LLVM backend can lower it to SIMD.
-#
-# The techniques used in this tutorial are a subset of tricks mentioned in this
-# `repository `_. Some of them
-# have been applied automatically by the TVM abstraction, but some of them
-# cannot be automatically applied due to TVM constraints.
-
-################################################################################
-# Preparation and Performance Baseline
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# We begin by collecting performance data on the `numpy` implementation of
-# matrix multiplication.
-
-# timeit is used below to benchmark the numpy baseline
-import timeit
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy
-
-# The size of the matrix
-# (M, K) x (K, N)
-# You are free to try out different shapes; sometimes TVM optimization outperforms numpy with MKL.
-M = 1024
-K = 1024
-N = 1024
-
-# The default tensor data type in tvm
-dtype = "float32"
-
-# You will want to adjust the target to match any CPU vector extensions you
-# might have. For example, if you're using the Intel AVX2 (Advanced Vector
-# Extensions) ISA for SIMD, you can get the best performance by changing the
-# following line to ``llvm -mcpu=core-avx2``, or to the specific CPU type you use.
-# Since you're using LLVM, you can get the CPU type from the command
-# ``llc --version``, and you can check ``/proc/cpuinfo`` for additional
-# extensions that your processor might support.
-
-target = tvm.target.Target(target="llvm", host="llvm")
-dev = tvm.device(target.kind.name, 0)
-
-# Randomly generated tensors for testing
-a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev)
-b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev)
-
-# Repeatedly perform a matrix multiplication to get a performance baseline
-# for the default numpy implementation
-np_repeat = 100
-np_running_time = timeit.timeit(
-    setup="import numpy\n"
-    "M = " + str(M) + "\n"
-    "K = " + str(K) + "\n"
-    "N = " + str(N) + "\n"
-    'dtype = "float32"\n'
-    "a = numpy.random.rand(M, K).astype(dtype)\n"
-    "b = numpy.random.rand(K, N).astype(dtype)\n",
-    stmt="answer = numpy.dot(a, b)",
-    number=np_repeat,
-)
-print("Numpy running time: %f" % (np_running_time / np_repeat))
-
-answer = numpy.dot(a.numpy(), b.numpy())
-
-################################################################################
-# Now we write a basic matrix multiplication using TVM TE and verify that it
-# produces the same results as the numpy implementation. We also write a
-# function that will help us measure the performance of the schedule
-# optimizations.
-
-# TVM Matrix Multiplication using TE
-k = te.reduce_axis((0, K), "k")
-A = te.placeholder((M, K), name="A")
-B = te.placeholder((K, N), name="B")
-C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")
-
-# Default schedule
-s = te.create_schedule(C.op)
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-
-def evaluate_operation(s, vars, target, name, optimization, log):
-    func = tvm.build(s, [A, B, C], target=target, name="mmult")
-    assert func
-
-    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-    func(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-    mean_time = evaluator(a, b, c).mean
-    print("%s: %f" % (optimization, mean_time))
-    log.append((optimization, mean_time))
-
-
-log = []
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="none", log=log)
-
-################################################################################
-# Let's take a look at the intermediate representation of the operator and
-# default schedule using the TVM lower function. Note how the implementation is
-# essentially a naive implementation of a matrix multiplication, using three
-# nested loops over the indices of the A and B matrices. 
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 1: Blocking
-# ~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# An important trick to enhance the cache hit rate is blocking, where you
-# structure memory access such that the memory accessed inside a block is a
-# small neighborhood with high memory locality. In this tutorial, we pick a
-# block factor of 32. This results in a block that fills a 32 * 32 * sizeof(float)
-# area of memory, or 4 KB, compared to a typical L1 cache size of 32 KB.
-#
-# We begin by creating a default schedule for the ``C`` operation, then apply a
-# ``tile`` scheduling primitive to it with the specified block factor, with the
-# scheduling primitive returning the resulting loop order from outermost to
-# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. We then get
-# the reduction axis for the output of the operation, and perform a split
-# operation on it using a factor of 4. This factor doesn't directly impact the
-# blocking optimization we're working on right now, but will be useful later
-# when we apply vectorization.
-#
-# Now that the operation has been blocked, we can reorder the computation to
-# put the reduction operation into the outermost loop of the computation,
-# helping to guarantee that the blocked data remains in cache. This completes
-# the schedule, and we can build and test the performance compared to the naive
-# schedule.
-
-bn = 32
-
-# Blocking by loop tiling
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-# Hoist reduction domain outside the blocking loop
-s[C].reorder(xo, yo, ko, ki, xi, yi)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log)
-
-################################################################################
-# By reordering the computation to take advantage of caching, you should see a
-# significant improvement in the performance of the computation. Now, print the
-# internal representation and compare it to the original:
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 2: Vectorization
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Another important optimization trick is vectorization. When the memory access
-# pattern is uniform, the compiler can detect this pattern and pass the
-# contiguous memory to the SIMD vector processor. In TVM, we can use the
-# ``vectorize`` interface to hint to the compiler about this pattern, taking
-# advantage of this hardware feature.
-#
-# In this tutorial, we choose to vectorize the inner loop row data since it is
-# already cache friendly from our previous optimizations.
-
-# Apply the vectorization optimization
-s[C].vectorize(yi)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="vectorization", log=log)
-
-# The generalized IR after vectorization
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 3: Loop Permutation
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# If we look at the above IR, we can see the inner loop row data is vectorized
-# and B is transformed into PackedB (this is evident by the `(float32x32*)B2`
-# portion of the inner loop). The traversal of PackedB is sequential now. 
So we
-# will look at the access pattern of A. In the current schedule, A is accessed
-# column by column, which is not cache friendly. If we change the nested loop
-# order of `ki` and the inner axis `xi`, the access pattern for the A matrix
-# will be more cache friendly.
-
-s = te.create_schedule(C.op)
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-# re-ordering
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
-
-evaluate_operation(
-    s, [A, B, C], target=target, name="mmult", optimization="loop permutation", log=log
-)
-
-# Again, print the new generalized IR
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 4: Array Packing
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Another important trick is array packing. The trick is to reorder the
-# storage dimensions of the array so that the continuous access pattern on a
-# certain dimension becomes sequential after flattening.
-#
-# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
-#    :align: center
-#
-# As shown in the figure above, after blocking the computations, we
-# can observe the array access pattern of B (after flattening), which is
-# regular but discontinuous. We expect that after some transformation we can
-# get a continuous access pattern. By reordering a ``[16][16]`` array to a
-# ``[16/4][16][4]`` array, the access pattern of B will be sequential when
-# grabbing the corresponding value from the packed array.
-#
-# To accomplish this, we are going to have to start with a new default
-# schedule, taking into account the new packing of B. It's worth taking a
-# moment to comment on this: TE is a powerful and expressive language for
-# writing optimized operators, but it often requires some knowledge of the
-# underlying algorithm, data structures, and hardware target that you are
-# writing for. Later in the tutorial, we will discuss some of the options for
-# letting TVM take that burden. Regardless, let's move on with the new
-# optimized schedule.
-
-# We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
-C = te.compute(
-    (M, N),
-    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
-    name="C",
-)
-
-s = te.create_schedule(C.op)
-
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
-
-x, y, z = s[packedB].op.axis
-s[packedB].vectorize(z)
-s[packedB].parallel(x)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="array packing", log=log)
-
-# Here is the generated IR after array packing.
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 5: Optimizing Block Writing Through Caching
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Up to this point, all of our optimizations have focused on efficiently
-# accessing and computing the data from the `A` and `B` matrices to compute the
-# `C` matrix. After the blocking optimization, the operator will write results
-# to `C` block by block, and the access pattern is not sequential. 
We can -# address this by using a sequential cache array, using a combination of -# `cache_write`, `compute_at`, and `unroll`to hold the block results and write -# to `C` when all the block results are ready. - -s = te.create_schedule(C.op) - -# Allocate write cache -CC = s.cache_write(C, "global") - -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) - -# Write cache is computed at yo -s[CC].compute_at(s[C], yo) - -# New inner axes -xc, yc = s[CC].op.axis - -(k,) = s[CC].op.reduce_axis -ko, ki = s[CC].split(k, factor=4) -s[CC].reorder(ko, xc, ki, yc) -s[CC].unroll(ki) -s[CC].vectorize(yc) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="block caching", log=log) - -# Here is the generated IR after write cache blocking. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 6: Parallelization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# So far, our computation is only designed to use a single core. Nearly all -# modern processors have multiple cores, and computation can benefit from -# running computations in parallel. The final optimization is to take advantage -# of thread-level parallelization. - -# parallel -s[C].parallel(xo) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation( - s, [A, B, C], target=target, name="mmult", optimization="parallelization", log=log -) - -# Here is the generated IR after parallelization. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Summary of Matrix Multiplication Example -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# After applying the above simple optimizations with only 18 lines of code, our -# generated code can begin to approach the performance of `numpy` with the Math -# Kernel Library (MKL). Since we've been logging the performance as we've been -# working, we can compare the results. - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - -################################################################################ -# Note that the outputs on the web page reflect the running times on a -# non-exclusive Docker container, and should be considered unreliable. It is -# highly encouraged to run the tutorial by yourself to observe the performance -# gain achieved by TVM, and to carefully work through each example to -# understand the iterative improvements that are made to the matrix -# multiplication operation. - -################################################################################ -# Final Notes and Summary -# ----------------------- -# As mentioned earlier, how to apply optimizations using TE and scheduling -# primitives can require some knowledge of the underlying architecture and -# algorithms. However, TE was designed to act as a foundation for more complex -# algorithms that can search the potential optimization. With the knowledge you -# have from this introduction to TE, we can now begin to explore how TVM can -# automate the schedule optimization process. 
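-#
-# Before moving on, here is a minimal sketch that pulls the whole vector
-# addition workflow together in one place: declare the computation, create a
-# schedule, build it, and save it to disk. The ``_recap`` names are used only
-# for this sketch.
-
-n_recap = te.var("n")
-A_recap = te.placeholder((n_recap,), name="A")
-B_recap = te.placeholder((n_recap,), name="B")
-C_recap = te.compute(A_recap.shape, lambda i: A_recap[i] + B_recap[i], name="C")
-s_recap = te.create_schedule(C_recap.op)
-fadd_recap = tvm.build(s_recap, [A_recap, B_recap, C_recap], tgt, name="myadd_recap")
-fadd_recap.export_library(temp.relpath("myadd_recap.so"))
-
-################################################################################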
-#
-# This tutorial provided a walkthrough of the TVM Tensor Expression (TE) workflow
-# using vector addition and matrix multiplication examples. The general workflow
-# is:
-#
-# - Describe your computation via a series of operations.
-# - Describe how we want the computation to be performed using schedule primitives.
-# - Compile to the target function we want.
-# - Optionally, save the function to be loaded later.
-#
-# Upcoming tutorials expand on the matrix multiplication example, and show how
-# you can build generic templates of the matrix multiplication and other
-# operations with tunable parameters that allow you to automatically optimize
-# the computation for specific platforms.
diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py
deleted file mode 100644
index c729b86a3245..000000000000
--- a/tutorials/get_started/tvmc_command_line_driver.py
+++ /dev/null
@@ -1,511 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Compiling and Optimizing a Model with TVMC
-==========================================
-**Authors**:
-`Leandro Nunes `_,
-`Matthew Barrett `_,
-`Chris Hoge `_
-
-In this section, we will work with TVMC, the TVM command line driver. TVMC is a
-tool that exposes TVM features such as auto-tuning, compiling, profiling and
-execution of models through a command line interface.
-
-Upon completion of this section, we will have used TVMC to accomplish the
-following tasks:
-
-* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
-* Run a real image through the compiled model, and interpret the output and
-  model performance.
-* Tune the model on a CPU using TVM.
-* Re-compile an optimized model using the tuning data collected by TVM.
-* Run the image through the optimized model, and compare the output and model
-  performance.
-
-The goal of this section is to give you an overview of TVM and TVMC's
-capabilities, and set the stage for understanding how TVM works.
-"""
-
-################################################################################
-# Using TVMC
-# ----------
-#
-# TVMC is a Python application, part of the TVM Python package.
-# When you install TVM using a Python package, you will get TVMC
-# as a command line application called ``tvmc``. The location of this command
-# will vary depending on your platform and installation method.
-#
-# Alternatively, if you have TVM as a Python module on your
-# ``$PYTHONPATH``, you can access the command line driver functionality
-# via the executable python module, ``python -m tvm.driver.tvmc``.
-#
-# For simplicity, this tutorial will mention the TVMC command line using
-# ``tvmc ``, but the same results can be obtained with
-# ``python -m tvm.driver.tvmc ``.
-#
-# You can check the help page using:
-#
-# .. 
code-block:: bash -# -# tvmc --help -# -# The main features of TVM available to ``tvmc`` are from subcommands -# ``compile``, and ``run``, and ``tune``. To read about specific options under -# a given subcommand, use ``tvmc --help``. We will cover each of -# these commands in this tutorial, but first we need to download a pre-trained -# model to work with. -# - - -################################################################################ -# Obtaining the Model -# ------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50-layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading `Netron -# `, a freely available ML model viewer. -# -# For this tutorial we will be using the model in ONNX format. -# -# .. code-block:: bash -# -# wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx -# - - -################################################################################ -# .. note:: Supported model formats -# -# TVMC supports models created with Keras, ONNX, TensorFlow, TFLite -# and Torch. Use the option``--model-format`` if you need to -# explicitly provide the model format you are using. See ``tvmc -# compile --help`` for more information. -# - -################################################################################ -# .. note:: Adding ONNX Support to TVM -# -# TVM relies on the ONNX python library being available on your system. You -# can install ONNX using the command ``pip3 install --user onnx``. You may -# remove the ``--user`` option if you have root access and want to install -# ONNX globally. -# - -################################################################################ -# Compiling an ONNX Model to the TVM Runtime -# ------------------------------------------ -# -# Once we've downloaded the ResNet-50 model, the next step is to compile it. To -# accomplish that, we are going to use ``tvmc compile``. The output we get from -# the compilation process is a TAR package of the model compiled to a dynamic -# library for our target platform. We can run that model on our target device -# using the TVM runtime. -# -# .. code-block:: bash -# -# tvmc compile \ -# --target "llvm" \ -# --output resnet50-v2-7-tvm.tar \ -# resnet50-v2-7.onnx -# -# Let's take a look at the files that ``tvmc compile`` creates in the module: -# -# .. code-block:: bash -# -# mkdir model -# tar -xvf resnet50-v2-7-tvm.tar -C model -# ls model -# -# You will see three files listed. -# -# * ``mod.so`` is the model, represented as a C++ library, that can be loaded -# by the TVM runtime. -# * ``mod.json`` is a text representation of the TVM Relay computation graph. -# * ``mod.params`` is a file containing the parameters for the pre-trained -# model. -# -# This module can be directly loaded by your application, and the model can be -# run via the TVM runtime APIs. - - -################################################################################ -# .. note:: Defining the Correct Target -# -# Specifying the correct target (option ``--target``) can have a huge -# impact on the performance of the compiled module, as it can take -# advantage of hardware features available on the target. 
For more -# information, please refer to `Auto-tuning a convolutional network -# for x86 CPU `_. -# We recommend identifying which CPU you are running, along with optional features, -# and set the target appropriately. -# - -################################################################################ -# Running the Model from The Compiled Module with TVMC -# ---------------------------------------------------- -# -# Now that we've compiled the model to this module, we can use the TVM runtime -# to make predictions with it. TVMC has the TVM runtime built in to it, -# allowing you to run compiled TVM models. To use TVMC to run the model and -# make predictions, we need two things: -# -# - The compiled module, which we just produced. -# - Valid input to the model to make predictions on. -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This -# is a well-supported NumPy format to serialize multiple arrays into a file -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute image for any of your choosing. -# -# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center - - -################################################################################ -# Input pre-processing -# ~~~~~~~~~~~~~~~~~~~~ -# -# For our ResNet 50 V2 model, the input is expected to be in ImageNet format. -# Here is an example of a script to pre-process an image for ResNet 50 V2. -# -# You will need to have a supported version of the Python Image Library -# installed. You can use ``pip3 install --user pillow`` to satisfy this -# requirement for the script. -# -# .. code-block:: python -# :caption: preprocess.py -# :name: preprocess.py -# -# #!python ./preprocess.py -# from tvm.contrib.download import download_testdata -# from PIL import Image -# import numpy as np -# -# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") -# -# # Resize it to 224x224 -# resized_image = Image.open(img_path).resize((224, 224)) -# img_data = np.asarray(resized_image).astype("float32") -# -# # ONNX expects NCHW input, so convert the array -# img_data = np.transpose(img_data, (2, 0, 1)) -# -# # Normalize according to ImageNet -# imagenet_mean = np.array([0.485, 0.456, 0.406]) -# imagenet_stddev = np.array([0.229, 0.224, 0.225]) -# norm_img_data = np.zeros(img_data.shape).astype("float32") -# for i in range(img_data.shape[0]): -# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] -# -# # Add batch dimension -# img_data = np.expand_dims(norm_img_data, axis=0) -# -# # Save to .npz (outputs imagenet_cat.npz) -# np.savez("imagenet_cat", data=img_data) -# - -################################################################################ -# Running the Compiled Module -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# With both the model and input data in hand, we can now run TVMC to make a -# prediction: -# -# .. code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# resnet50-v2-7-tvm.tar -# -# Recall that the `.tar` model file includes a C++ library, a description of -# the Relay model, and the parameters for the model. 
TVMC includes the TVM -# runtime, which can load the model and make predictions against input. When -# running the above command, TVMC outputs a new file, ``predictions.npz``, that -# contains the model output tensors in NumPy format. -# -# In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via an RPC -# Tracker. To read more about these options please check ``tvmc run --help``. - -################################################################################ -# Output Post-Processing -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet 50 V2 into a more human-readable form, using the lookup-table provided -# for the model. -# -# The script below shows an example of the post-processing to extract labels -# from the output of our compiled module. -# -# .. code-block:: python -# :caption: postprocess.py -# :name: postprocess.py -# -# #!python ./postprocess.py -# import os.path -# import numpy as np -# -# from scipy.special import softmax -# -# from tvm.contrib.download import download_testdata -# -# # Download a list of labels -# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -# labels_path = download_testdata(labels_url, "synset.txt", module="data") -# -# with open(labels_path, "r") as f: -# labels = [l.rstrip() for l in f] -# -# output_file = "predictions.npz" -# -# # Open the output and read the output tensor -# if os.path.exists(output_file): -# with np.load(output_file) as data: -# scores = softmax(data["output_0"]) -# scores = np.squeeze(scores) -# ranks = np.argsort(scores)[::-1] -# -# for rank in ranks[0:5]: -# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) -# -# Running this script should produce the following output: -# -# .. code-block:: bash -# -# python postprocess.py -# -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 -# -# Try replacing the cat image with other images, and see what sort of -# predictions the ResNet model makes. - -################################################################################ -# Automatically Tuning the ResNet Model -# ------------------------------------- -# -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVMC to target your working platform. -# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. 
-
-# The results of these runs are stored in a tuning records file, which is
-# ultimately the output of the ``tune`` subcommand.
-#
-# In the simplest form, tuning requires you to provide three things:
-#
-# - the target specification of the device you intend to run this model on
-# - the path to an output file in which the tuning records will be stored, and
-#   finally
-# - a path to the model to be tuned.
-#
-# The example below demonstrates how that works in practice:
-#
-# .. code-block:: bash
-#
-#   tvmc tune \
-#   --target "llvm" \
-#   --output resnet50-v2-7-autotuner_records.json \
-#   resnet50-v2-7.onnx
-#
-# In this example, you will see better results if you indicate a more specific
-# target for the `--target` flag. For example, on an Intel i7 processor you
-# could use `--target llvm -mcpu=skylake`. For this tuning example, we are
-# tuning locally on the CPU using LLVM as the compiler for the specified
-# architecture.
-#
-# TVMC will perform a search against the parameter space for the model, trying
-# out different configurations for operators and choosing the one that runs
-# fastest on your platform. Although this is a guided search based on the CPU
-# and model operations, it can still take several hours to complete the search.
-# The output of this search will be saved to the
-# `resnet50-v2-7-autotuner_records.json` file, which will later be used to
-# compile an optimized model.
-#
-# .. note:: Defining the Tuning Search Algorithm
-#
-#   By default this search is guided using an `XGBoost Grid` algorithm.
-#   Depending on your model complexity and the amount of time available, you
-#   might want to choose a different algorithm. A full list is available by
-#   consulting ``tvmc tune --help``.
-#
-# The output will look something like this for a consumer-level Skylake CPU:
-#
-# .. code-block:: bash
-#
-#   tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx
-#   # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
-#   # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
-#   # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
-#   # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
-#   # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
-#   # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
-#   # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
-#   # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
-#   # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
-#   # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
-#   # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
-#   # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
-#   # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
-#   # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
-#   # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
-#   # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
-#   # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done. 
-# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. -# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. -# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. -# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. -# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. -# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. -# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. -# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. -# -# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning -# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning -# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information. -# - -################################################################################ -# Compiling an Optimized Model with Tuning Data -# ---------------------------------------------- -# -# As an output of the tuning process above, we obtained the tuning records -# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in -# two ways: -# -# - As input to further tuning (via ``tvmc tune --tuning-records``). -# - As input to the compiler -# -# The compiler will use the results to generate high performance code for the -# model on your specified target. To do that we can use ``tvmc compile -# --tuning-records``. Check ``tvmc compile --help`` for more information. -# -# Now that tuning data for the model has been collected, we can re-compile the -# model using optimized operators to speed up our computations. -# -# .. code-block:: bash -# -# tvmc compile \ -# --target "llvm" \ -# --tuning-records resnet50-v2-7-autotuner_records.json \ -# --output resnet50-v2-7-tvm_autotuned.tar \ -# resnet50-v2-7.onnx -# -# Verify that the optimized model runs and produces the same results: -# -# .. code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# resnet50-v2-7-tvm_autotuned.tar -# -# python postprocess.py -# -# Verifying that the predictions are the same: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610550 -# # class='n02123159 tiger cat' with probability=0.367181 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Comparing the Tuned and Untuned Models -# -------------------------------------- -# -# TVMC gives you tools for basic performance benchmarking between the models. -# You can specify a number of repetitions and that TVMC report on the model run -# time (independent of runtime startup). We can get a rough idea of how much -# tuning has improved the model performance. For example, on a test Intel i7 -# system, we see that the tuned model runs 47% faster than the untuned model: -# -# .. 
code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm_autotuned.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 92.19 115.73 89.85 3.15 -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 193.32 219.97 185.04 7.11 -# - - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we presented TVMC, a command line driver for TVM. We -# demonstrated how to compile, run, and tune a model. We also discussed the -# need for pre and post-processing of inputs and outputs. After the tuning -# process, we demonstrated how to compare the performance of the unoptimized -# and optimize models. -# -# Here we presented a simple example using ResNet 50 V2 locally. However, TVMC -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. -# -# To see what other options are available, please have a look at ``tvmc -# --help``. -# -# In the next tutorial, `Compiling and Optimizing a Model with the Python -# Interface `_, we will cover the same compilation -# and optimization steps using the Python interface. diff --git a/tutorials/language/README.txt b/tutorials/language/README.txt deleted file mode 100644 index 6da8e3c57c1f..000000000000 --- a/tutorials/language/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Tensor Expression and Schedules -------------------------------- diff --git a/tutorials/micro/README.txt b/tutorials/micro/README.txt deleted file mode 100644 index 70a5e580ecd1..000000000000 --- a/tutorials/micro/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorial-micro: - -microTVM --------- diff --git a/tutorials/topi/README.txt b/tutorials/topi/README.txt deleted file mode 100644 index eae0aafafc02..000000000000 --- a/tutorials/topi/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -TOPI: TVM Operator Inventory -----------------------------