From 135cc0318f2f02a24e94dcd5ba1fec765fa9664b Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 6 Aug 2021 17:11:40 +0000 Subject: [PATCH 1/2] Proof of concept for documentation refactor. Proposal for documentation refactor. Changes include: Major changes to top-level structure. Separation of user and developer docs. Reclassifying many tutorials as how-tos. Moving sphinx galleries to top level "gallery" directory. Consistent naming between galleries and top-level structure --- docs/conf.py | 16 +- docs/dev_deep_dive/index.rst | 25 + docs/dev_how_tos/index.rst | 25 + docs/dev_reference/index.rst | 24 + docs/dev_tutorials/index.rst | 24 + docs/index.rst | 51 +- docs/user_deep_dive/index.rst | 24 + docs/user_how_tos/index.rst | 23 + docs/user_reference/index.rst | 23 + docs/user_tutorials/index.rst | 25 + gallery/user_tutorials/README.txt | 4 + .../auto_scheduler_matmul_x86.py | 214 +++++ gallery/user_tutorials/autotvm_matmul_x86.py | 377 ++++++++ gallery/user_tutorials/autotvm_relay_x86.py | 476 +++++++++ .../cross_compilation_and_rpc.py | 265 +++++ gallery/user_tutorials/install.py | 50 + gallery/user_tutorials/introduction.py | 134 +++ gallery/user_tutorials/relay_quick_start.py | 155 +++ .../user_tutorials/tensor_expr_get_started.py | 903 ++++++++++++++++++ .../tvmc_command_line_driver.py | 511 ++++++++++ 20 files changed, 3308 insertions(+), 41 deletions(-) create mode 100644 docs/dev_deep_dive/index.rst create mode 100644 docs/dev_how_tos/index.rst create mode 100644 docs/dev_reference/index.rst create mode 100644 docs/dev_tutorials/index.rst create mode 100644 docs/user_deep_dive/index.rst create mode 100644 docs/user_how_tos/index.rst create mode 100644 docs/user_reference/index.rst create mode 100644 docs/user_tutorials/index.rst create mode 100644 gallery/user_tutorials/README.txt create mode 100644 gallery/user_tutorials/auto_scheduler_matmul_x86.py create mode 100644 gallery/user_tutorials/autotvm_matmul_x86.py create mode 100644 gallery/user_tutorials/autotvm_relay_x86.py create mode 100644 gallery/user_tutorials/cross_compilation_and_rpc.py create mode 100644 gallery/user_tutorials/install.py create mode 100644 gallery/user_tutorials/introduction.py create mode 100644 gallery/user_tutorials/relay_quick_start.py create mode 100644 gallery/user_tutorials/tensor_expr_get_started.py create mode 100644 gallery/user_tutorials/tvmc_command_line_driver.py diff --git a/docs/conf.py b/docs/conf.py index 4a0455214db3..6962c448ed53 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -197,11 +197,12 @@ def git_describe_version(original_version): from sphinx_gallery.sorting import ExplicitOrder -examples_dirs = ["../tutorials/", "../vta/tutorials/"] -gallery_dirs = ["tutorials", "vta/tutorials"] +examples_dirs = ["../gallery/user_tutorials", "../tutorials/", "../vta/tutorials/"] +gallery_dirs = ["user_tutorials", "tutorials", "vta/tutorials"] subsection_order = ExplicitOrder( [ + "../gallery/user_tutorials", "../tutorials/get_started", "../tutorials/frontend", "../tutorials/language", @@ -223,6 +224,17 @@ def git_describe_version(original_version): # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. 
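# A hedged aside (hypothetical helper, not part of this patch): an ordering
# dict such as ``within_subsection_order`` defined just below can back a
# sphinx-gallery sort key roughly as sketched here, keeping listed files in
# their listed position and sorting everything else after them by filename.
#
#     import os
#
#     class ListedFirstSortKey:
#         def __init__(self, src_dir):
#             self.src_dir = src_dir
#
#         def __call__(self, filename):
#             # Listed files keep their position; unlisted files sort after them.
#             listed = within_subsection_order.get(os.path.basename(self.src_dir), [])
#             if filename in listed:
#                 return "\0%03d" % listed.index(filename)
#             return filename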
within_subsection_order = { + "user_tutorials": [ + "introduction.py", + "install.py", + "tvmc_command_line_driver.py", + "autotvm_relay_x86.py", + "tensor_expr_get_started.py", + "autotvm_matmul_x86.py", + "auto_scheduler_matmul_x86.py", + "cross_compilation_and_rpc.py", + "relay_quick_start.py", + ], "get_started": [ "introduction.py", "install.py", diff --git a/docs/dev_deep_dive/index.rst b/docs/dev_deep_dive/index.rst new file mode 100644 index 000000000000..41189226b8c1 --- /dev/null +++ b/docs/dev_deep_dive/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _deep_dive: + +Deep Dive +========= +TVM Developer Deep Dives + +Refactor placeholder + diff --git a/docs/dev_how_tos/index.rst b/docs/dev_how_tos/index.rst new file mode 100644 index 000000000000..e8cce9365df2 --- /dev/null +++ b/docs/dev_how_tos/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _reference: + +Deep Dive +========= +TVM Developer Deep Dive + +Refactor placeholder + diff --git a/docs/dev_reference/index.rst b/docs/dev_reference/index.rst new file mode 100644 index 000000000000..8a118d442e0b --- /dev/null +++ b/docs/dev_reference/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_reference: + +Reference +========= +TVM Developer Reference + +Placeholder for docs refactor diff --git a/docs/dev_tutorials/index.rst b/docs/dev_tutorials/index.rst new file mode 100644 index 000000000000..2705f21907b0 --- /dev/null +++ b/docs/dev_tutorials/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _tutorial: + +Tutorial +======== +TVM Developer Tutorial + +Placeholder for documentation refactor diff --git a/docs/index.rst b/docs/index.rst index 491c42712e9a..ad1cfc49d4dd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,57 +34,30 @@ For Developers architecture of TVM and/or actively develop on the project. - :doc:`dev/how_to` gives quick development tips on various topics. - -.. toctree:: - :maxdepth: 1 - :caption: How to - :hidden: - - install/index - contribute/index - deploy/index - dev/how_to - microtvm/index - errors - faq - -.. toctree:: - :maxdepth: 1 - :caption: Tutorials - :hidden: - - tutorials/index - - -.. toctree:: - :maxdepth: 2 - :caption: References - :hidden: - - langref/index - api/python/index - api/links - .. toctree:: :maxdepth: 2 + :caption: TVM User Guide :hidden: - :caption: Deep Dive - dev/index + user_tutorials/index + user_how_tos/index + user_deep_dive/index + user_reference/index .. toctree:: - :maxdepth: 2 + :maxdepth: 1 + :caption: TVM Developer Guide :hidden: - :caption: MISC - - vta/index - profiling/index + dev_tutorials/index + dev_how_tos/index + dev_deep_dive/index + dev_reference/index Index ----- - .. toctree:: :maxdepth: 1 + :caption: Index genindex diff --git a/docs/user_deep_dive/index.rst b/docs/user_deep_dive/index.rst new file mode 100644 index 000000000000..2e116ddbcc70 --- /dev/null +++ b/docs/user_deep_dive/index.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_user_guide: + +Deep Dive +========= +TVM User Deep Dives + +refactor placeholder until content is in place diff --git a/docs/user_how_tos/index.rst b/docs/user_how_tos/index.rst new file mode 100644 index 000000000000..cc59879e1057 --- /dev/null +++ b/docs/user_how_tos/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _user_guide: + +How To +======= +TVM How Tos + diff --git a/docs/user_reference/index.rst b/docs/user_reference/index.rst new file mode 100644 index 000000000000..cc59879e1057 --- /dev/null +++ b/docs/user_reference/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _user_guide: + +How To +======= +TVM How Tos + diff --git a/docs/user_tutorials/index.rst b/docs/user_tutorials/index.rst new file mode 100644 index 000000000000..4f01312426f2 --- /dev/null +++ b/docs/user_tutorials/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. 
_user_guide: + +Tutorial +========= +TVM User Tutorial + +This is a placeholder and is overwritten by Spinx Gallery +/tvm/gallery/user_tutorials diff --git a/gallery/user_tutorials/README.txt b/gallery/user_tutorials/README.txt new file mode 100644 index 000000000000..0d5a287feff4 --- /dev/null +++ b/gallery/user_tutorials/README.txt @@ -0,0 +1,4 @@ +Tutorials +--------- + +Within this gallery is an introduction to TVM. diff --git a/gallery/user_tutorials/auto_scheduler_matmul_x86.py b/gallery/user_tutorials/auto_scheduler_matmul_x86.py new file mode 100644 index 000000000000..f9fb3615aedc --- /dev/null +++ b/gallery/user_tutorials/auto_scheduler_matmul_x86.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Optimizing Operators with Auto-scheduling +========================================= +**Author**: `Lianmin Zheng `_, \ + `Chengfan Jia `_ + +In this tutorial, we will show how TVM's Auto Scheduling feature can find +optimal schedules without the need for writing a custom template. + +Different from the template-based :doc:`AutoTVM ` which relies on +manual templates to define the search space, the auto-scheduler does not +require any templates. Users only need to write the computation declaration +without any schedule commands or templates. The auto-scheduler can +automatically generate a large search space and find a good schedule in the +space. + +We use matrix multiplication as an example in this tutorial. + +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. To + get it to run, you will need to wrap the body of this tutorial in a :code:`if + __name__ == "__main__":` block. +""" + +import os + +import numpy as np +import tvm +from tvm import te, auto_scheduler + +################################################################################ +# Defining the Matrix Multiplication +# ---------------------------------- +# To start, we define a matrix multiplication with a bias addition. Note that +# this uses standard operations available in TVMs Tensor Expression language. +# The major difference is the use of the `auto_sceduler` decorator at the top +# of the function definition. The function should return a list of +# input/output tensors. From these tensors, the auto-scheduler can get the +# whole computational graph. 
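################################################################################
# .. note:: Registering a workload under an explicit name
#
#   A hedged aside, not part of the original tutorial: assuming
#   ``register_workload`` also accepts an explicit string key, a workload can
#   be registered by name and the search task created from that string later
#   on, for example:
#
#   .. code-block:: python
#
#     @auto_scheduler.register_workload("plain_matmul")
#     def plain_matmul(N, L, M, dtype):
#         A = te.placeholder((N, L), name="A", dtype=dtype)
#         B = te.placeholder((L, M), name="B", dtype=dtype)
#         k = te.reduce_axis((0, L), name="k")
#         C = te.compute(
#             (N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C"
#         )
#         return [A, B, C]
#
#     task_by_name = auto_scheduler.SearchTask(
#         func="plain_matmul", args=(128, 128, 128, "float32"), target="llvm"
#     )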
+ + +@auto_scheduler.register_workload # Note the auto_scheduler decorator +def matmul_add(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + C = te.placeholder((N, M), name="C", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + matmul = te.compute( + (N, M), + lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), + name="matmul", + attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B + ) + out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") + + return [A, B, C, out] + + +################################################################################ +# Create the search task +# ---------------------- +# With the function defined, we can now create the task for the auto_scheduler +# to search against. We specify the particular parameters for this matrix +# multiplication, in this case a multiplication of to square matricies of size +# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" +# +# .. note:: Improve performance with custom targets +# In order for TVM to take full advantage of specific hardware platforms, +# you will want to manuall specify your CPU capabilities. For example: +# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 +# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 + +target = tvm.target.Target("llvm") +N = L = M = 1024 +task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) + +# Inspect the computational graph +print("Computational DAG:") +print(task.compute_dag) + +################################################################################ +# Set Parameters for Auto-Scheduler +# --------------------------------- +# Next, we set parameters for the auto-scheduler. +# +# * :code:`num_measure_trials` is the number of measurement trials we can use +# during the search. We only make 10 trials in this tutorial for a fast +# demonstration. In practice, 1000 is a good value for the search to converge. +# You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to log measurement records into a +# file `matmul.json`. The measurement records can be used to query the history +# best, resume the search, and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions` for more parameters + +log_file = "matmul.json" +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, +) + +################################################################################ +# Run the search +# -------------- +# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the +# search and let the auto-scheduler do its magic. After some measurement +# trials, we can load the best schedule from the log file and apply it. + +# Run auto-tuning (search) +task.tune(tune_option) +# Apply the best schedule +sch, args = task.apply_best(log_file) + +################################################################################ +# Inspecting the Optimized Schedule +# --------------------------------- +# We can lower the schedule to see the IR after auto-scheduling. The +# auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and +# operator fusion. 
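################################################################################
# For contrast (a sketch, not part of the original tutorial), the same compute
# definition can also be lowered with the default, untouched TE schedule; the
# auto-scheduled IR printed just below is then easy to compare against it.

naive_A, naive_B, naive_C, naive_out = matmul_add(N, L, M, "float32")
naive_sch = te.create_schedule(naive_out.op)
print("Naive TIR (no auto-scheduling, for comparison):")
print(tvm.lower(naive_sch, [naive_A, naive_B, naive_C, naive_out], simple_mode=True))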
+ +print("Lowered TIR:") +print(tvm.lower(sch, args, simple_mode=True)) + +################################################################################ +# Check correctness and evaluate performance +# ------------------------------------------ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) +a_np = np.random.uniform(size=(N, L)).astype(np.float32) +b_np = np.random.uniform(size=(L, M)).astype(np.float32) +c_np = np.random.uniform(size=(N, M)).astype(np.float32) +out_np = a_np.dot(b_np) + c_np + +dev = tvm.cpu() +a_tvm = tvm.nd.array(a_np, device=dev) +b_tvm = tvm.nd.array(b_np, device=dev) +c_tvm = tvm.nd.array(c_np, device=dev) +out_tvm = tvm.nd.empty(out_np.shape, device=dev) +func(a_tvm, b_tvm, c_tvm, out_tvm) + +# Check results +np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) + +# Evaluate execution time. +evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) +) + + +################################################################################ +# Using the record file +# --------------------- +# During the search, all measurement records are logged into the record file +# "matmul.json". The measurement records can be used to re-apply search +# results, resume the search, and perform other analyses. +# +# Here is an example where we load the best schedule from a file, and print the +# equivalent python schedule API. This can be used for debugging and learning +# the behavior of the auto-scheduler. + +print("Equivalent python schedule:") +print(task.print_best(log_file)) + +################################################################################ +# A more complicated example is to resume the search. In this case, we need to +# create the search policy and cost model by ourselves and resume the status of +# search policy and cost model with the log file. In the example below we +# resume the status and do more 5 trials. + + +def resume_search(task, log_file): + print("Resume search:") + cost_model = auto_scheduler.XGBModel() + cost_model.update_from_file(log_file) + search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] + ) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] + ) + task.tune(tune_option, search_policy=search_policy) + + +resume_search(task, log_file) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to use the TVM Auto-Scheduler to +# automatically optimize a matrix multiplication, without the need to specify a +# search template. It ends a series of examples that starts from the Tensor +# Expression (TE) language that demonstrates how TVM can optimize computational +# operations. diff --git a/gallery/user_tutorials/autotvm_matmul_x86.py b/gallery/user_tutorials/autotvm_matmul_x86.py new file mode 100644 index 000000000000..f9b33b894192 --- /dev/null +++ b/gallery/user_tutorials/autotvm_matmul_x86.py @@ -0,0 +1,377 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-autotvm-matmul-x86: + +Optimizing Operators with Schedule Templates and AutoTVM +======================================================== +**Authors**: +`Lianmin Zheng `_, +`Chris Hoge `_ + +In this tutorial, we show how the TVM Tensor Expression (TE) language +can be used to write schedule templates that can be searched by AutoTVM to +find the optimal schedule. This process is called Auto-Tuning, which helps +automate the process of optimizing tensor computation. + +This tutorial builds on the previous `tutorial on how to write a matrix +multiplication using TE `. + +There are two steps in auto-tuning. + +- The first step is defining a search space. +- The second step is running a search algorithm to explore through this space. + +In this tutorial, you can learn how to perform these two steps in TVM. The whole +workflow is illustrated by a matrix multiplication example. + +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. + To get it to run, you will need to wrap the body of this tutorial in a + :code:`if __name__ == "__main__":` block. +""" + +################################################################################ +# Install dependencies +# -------------------- +# To use autotvm package in TVM, we need to install some extra dependencies. +# +# .. code-block:: bash +# +# pip3 install --user psutil xgboost cloudpickle +# +# To make TVM run faster in tuning, it is recommended to use cython as FFI of +# TVM. In the root directory of TVM, execute: +# +# .. code-block:: bash +# +# pip3 install --user cython +# sudo make cython3 +# +# Now return to python code. Begin by importing the required packages. + +import logging +import sys + +import numpy as np +import tvm +from tvm import te +import tvm.testing + +# the module is called `autotvm` +from tvm import autotvm + +################################################################################ +# Basic Matrix Multiplication with TE +# ----------------------------------- +# Recall the basic implementation of matrix multiplication using TE. We write +# it down here with a few changes. We will wrap the multiplication in a python +# function definition. For simplicity, we will focus our attention on a split +# optimization, using a fixed value that defines the block size of the +# reordering. 
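################################################################################
# As a hedged aside (a sketch, not part of the original tutorial): once
# ``matmul_basic`` below is defined, it can be built and run directly to give
# an untuned baseline, for example:
#
# .. code-block:: python
#
#   s, tensors = matmul_basic(512, 512, 512, "float32")
#   baseline = tvm.build(s, tensors, target="llvm")
#   a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"))
#   b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"))
#   c = tvm.nd.empty((512, 512))
#   baseline(a, b, c)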
+ + +def matmul_basic(N, L, M, dtype): + + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + yo, yi = s[C].split(y, 8) + xo, xi = s[C].split(x, 8) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Matrix Multiplication with AutoTVM +# ---------------------------------- +# In the previous schedule code, we use a constant "8" as the tiling factor. +# However, it might not be the best one because the best tiling factor depends +# on real hardware environment and input shape. +# +# If you want the schedule code to be portable across a wider range of input +# shapes and target hardware, it is better to define a set of candidate values +# and pick the best one according to the measurement results on target +# hardware. +# +# In autotvm, we can define a tunable parameter, or a "knob" for such kind of +# value. + +################################################################################ +# A Basic Matrix Multiplication Template +# -------------------------------------- +# We begin with an example of how to create a tunable parameter set for the +# block size of the `split` scheduling operation. + +# Matmul V1: List candidate values +@autotvm.template("tutorial/matmul_v1") # 1. use a decorator +def matmul_v1(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + # 2. get the config object + cfg = autotvm.get_config() + + # 3. define search space + cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) + cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) + + # 4. schedule according to config + yo, yi = s[C].split(y, cfg["tile_y"].val) + xo, xi = s[C].split(x, cfg["tile_x"].val) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# Here we make four modifications to the previous schedule code and get a +# tunable "template". We can explain the modifications one by one. +# +# 1. Use a decorator to mark this function as a simple template. +# 2. Get a config object: You can regard this :code:`cfg` as an argument of +# this function but we obtain it in a different way. With this argument, this +# function is no longer a deterministic schedule. Instead, we can pass +# different configurations to this function and get different schedules. A +# function that uses a configuration object like this is called a "template". +# +# To make the template function more compact, we can do two things to define +# the parameter search space within a single function. +# +# 1. Define a search space across a set values. This is done by making +# :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the +# tunable knobs in this function and build a search space from it. +# 2. Schedule according to an entity in this space. This is done by making +# :code:`cfg` a :any:`ConfigEntity` object. 
When it is a +# :any:`ConfigEntity`, it will ignore all space definition API (namely, +# :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic +# values for all tunable knobs, and we schedule according to these values. +# +# During auto-tuning, we will first call this template with a +# :any:`ConfigSpace` object to build the search space. Then we call this +# template with different :any:`ConfigEntity` in the built space to get +# different schedules. Finally we will measure the code generated by +# different schedules and pick the best one. +# +# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible +# values. The second one is :code:`tile_x` with a same list of possible values. +# These two knobs are independent, so they span a search space with size 25 = +# 5x5. +# 4. The configuration knobs are passed to the :code:`split` schedule +# operation, allowing us to schedule according to the 5x5 deterministic values +# we previously defined in :code:`cfg`. + +################################################################################ +# A Matrix Multiplication Template with the Advanced Parameter API +# ---------------------------------------------------------------- +# In the previous template, we manually listed all of the possible values for a +# knob. This is the lowest level API to define the space, and gives an explicit +# enumeration of the parameter space to search. However, we also provide +# another set of APIs that can make the definition of the search space easier +# and smarter. Where possible, we receomment you use this higher-level API +# +# In the following example, we use :any:`ConfigSpace.define_split` to define a +# split knob. It will enumerate all the possible ways to split an axis and +# construct the space. +# +# We also have :any:`ConfigSpace.define_reorder` for reorder knob and +# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, +# thread binding. When the high level API cannot meet your requirements, you +# can always fall back to using the low level API. + + +@autotvm.template("tutorial/matmul") +def matmul(N, L, M, dtype): + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + + k = te.reduce_axis((0, L), name="k") + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") + s = te.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + ##### define space begin ##### + cfg = autotvm.get_config() + cfg.define_split("tile_y", y, num_outputs=2) + cfg.define_split("tile_x", x, num_outputs=2) + ##### define space end ##### + + # schedule according to config + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] + + +################################################################################ +# .. note:: More Explanation on :code:`cfg.define_split` +# +# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will +# enumerate all possible combinations that can split axis y into two axes with +# factors of the length of y. For example, if the length of y is 32 and we +# want to split it into two axes using factors of 32, then there are 6 +# possible values for (length of outer axis, length of inner axis) pair, +# namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6 +# possible values of `tile_y`. 
+# +# During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. +# We stores the lengths of outer axes and inner axes in +# :code:`cfg['tile_y'].size` (a tuple with two elements). In this template, +# we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. +# Actually, this is equivalent to :code:`yo, yi = s[C].split(y, +# cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y, +# nparts=cfg['tile_y"].size[0])` +# +# The advantage of using cfg.apply API is that it makes multi-level splits +# (that is, when num_outputs >= 3) easier. + +################################################################################ +# Step 2: Use AutoTVM to Optimize the Matrix Multiplication +# --------------------------------------------------------- +# In Step 1, we wrote a matrix multiplication template that allowed us to +# paramaterize the block size used in the `split` schedule. We can now conduct +# a search over this parameter space. The next step is to pick a tuner to guide +# the exploration of this space. +# +# Auto-tuners in TVM +# ~~~~~~~~~~~~~~~~~~ +# The job for a tuner can be described by following pseudo code +# +# .. code-block:: c +# +# ct = 0 +# while ct < max_number_of_trials: +# propose a batch of configs +# measure this batch of configs on real hardware and get results +# ct += batch_size +# +# When proposing the next batch of configs, the tuner can take different +# strategies. Some of the tuner strategies provided by TVM include: +# +# * :any:`tvm.autotvm.tuner.RandomTuner`: Enumerate the space in a random order +# * :any:`tvm.autotvm.tuner.GridSearchTuner`: Enumerate the space in a grid search order +# * :any:`tvm.autotvm.tuner.GATuner`: Using genetic algorithm to search through the space +# * :any:`tvm.autotvm.tuner.XGBTuner`: Uses a model based method. Train a XGBoost model to +# predict the speed of lowered IR and pick the next batch according to the +# prediction. +# +# You can choose the tuner according to the size of your space, your time +# budget and other factors. For example, if your space is very small (less +# than 1000), a gridsearch tuner or a random tuner is good enough. If your +# space is at the level of 10^9 (this is the space size of a conv2d operator on +# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. + +################################################################################ +# Begin tuning +# ~~~~~~~~~~~~ +# Here we continue our matrix multiplication example. First we create a tuning +# task. We can also inspect the initialized search space. In this case, for a +# 512x512 square matrix multiplication, the space size is 10x10=100 Note that +# the task and search space are independent of the tuner picked. + +N, L, M = 512, 512, 512 +task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") +print(task.config_space) + +################################################################################ +# Then we need to define how to measure the generated code and pick a tuner. +# Since our space is small, a random tuner is just okay. +# +# We only make 10 trials in this tutorial for demonstration. In practice, you +# can do more trials according to your time budget. We will log the tuning +# results into a log file. This file can be used to choose the best +# configuration discovered by the tuner later. 
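################################################################################
# .. note:: Inspecting the tuning log
#
#   A hedged aside (a sketch assuming the ``autotvm.record`` helpers): once the
#   tuning run below has written ``matmul.log``, the raw measurement records
#   can be loaded and inspected directly, for example:
#
#   .. code-block:: python
#
#     from tvm.autotvm.record import load_from_file
#
#     for inp, res in load_from_file("matmul.log"):
#         print(inp.config, res.costs)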
+ +# logging config (for printing tuning log to the screen) +logging.getLogger("autotvm").setLevel(logging.DEBUG) +logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) + +################################################################################ +# There are two steps for measuring a config: build and run. By default, we use +# all CPU cores to compile program. We then measure them sequentially. To help +# reduce variance, we take 5 measurements and average them. +measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) + +# Begin tuning with RandomTuner, log records to file `matmul.log` +# You can use alternatives like XGBTuner. +tuner = autotvm.tuner.RandomTuner(task) +tuner.tune( + n_trial=10, + measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file("matmul.log")], +) + +################################################################################ +# With tuning completed, we can choose the configuration from the log file that +# has the best measured performance and compile the schedule with the +# corresponding parameters. We also do a quick verfication that the schedule is +# producing correct answers. We can call the function :code:`matmul` directly +# under the :any:`autotvm.apply_history_best` context. When we call this +# function, it will query the dispatch context with its argument and get the +# best config with the same argument. + +# apply history best from log file +with autotvm.apply_history_best("matmul.log"): + with tvm.target.Target("llvm"): + s, arg_bufs = matmul(N, L, M, "float32") + func = tvm.build(s, arg_bufs) + +# check correctness +a_np = np.random.uniform(size=(N, L)).astype(np.float32) +b_np = np.random.uniform(size=(L, M)).astype(np.float32) +c_np = a_np.dot(b_np) + +c_tvm = tvm.nd.empty(c_np.shape) +func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) + +tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to build operator templates that allow +# TVM to search a parameter space and choose optimized schedule configurations. +# To gain a deeper understanding of how this works, we recommend expanding on +# this example by adding new search parameters to the schedule based on +# schedule operations demonstated in the `Getting Started With Tensor +# Expressions _` tutorial. In the upcoming sections, we +# will demonstate the AutoScheduler, a method for TVM to optimize common +# operators without the need for the user to provide a user-defined template. diff --git a/gallery/user_tutorials/autotvm_relay_x86.py b/gallery/user_tutorials/autotvm_relay_x86.py new file mode 100644 index 000000000000..67faec4505a6 --- /dev/null +++ b/gallery/user_tutorials/autotvm_relay_x86.py @@ -0,0 +1,476 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Compiling and Optimizing a Model with the Python Interface (AutoTVM) +==================================================================== +**Author**: +`Chris Hoge `_ + +In the `TVMC Tutorial `_, we covered how to compile, run, and tune a +pre-trained vision model, ResNet-50-v2 using the command line interface for +TVM, TVMC. TVM is more that just a command-line tool though, it is an +optimizing framework with APIs available for a number of different languages +that gives you tremendous flexibility in working with machine learning models. + +In this tutorial we will cover the same ground we did with TVMC, but show how +it is done with the Python API. Upon completion of this section, we will have +used the Python API for TVM to accomplish the following tasks: + +* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. +* Run a real image through the compiled model, and interpret the output and model + performance. +* Tune the model that model on a CPU using TVM. +* Re-compile an optimized model using the tuning data collected by TVM. +* Run the image through the optimized model, and compare the output and model + performance. + +The goal of this section is to give you an overview of TVM's capabilites and +how to use them through the Python API. +""" + +################################################################################ +# TVM is a deep learning compiler framework, with a number of different modules +# available for working with deep learning models and operators. In this +# tutorial we will work through how to load, compile, and optimize a model +# using the Python API. +# +# We begin by importing a number of dependencies, including ``onnx`` for +# loading and converting the model, helper utilities for downloading test data, +# the Python Image Library for working with the image data, ``numpy`` for pre +# and post-processing of the image data, the TVM Relay framework, and the TVM +# Graph Executor. + +import onnx +from tvm.contrib.download import download_testdata +from PIL import Image +import numpy as np +import tvm.relay as relay +import tvm +from tvm.contrib import graph_executor + +################################################################################ +# Downloading and Loading the ONNX Model +# -------------------------------------- +# +# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a +# convolutional neural network that is 50-layers deep and designed to classify +# images. The model we will be using has been pre-trained on more than a +# million images with 1000 different classifications. The network has an input +# image size of 224x224. If you are interested exploring more of how the +# ResNet-50 model is structured, we recommend downloading +# `Netron `_, a freely available ML model viewer. +# +# TVM provides a helper library to download pre-trained models. By providing a +# model URL, file name, and model type through the module, TVM will download +# the model and save it to disk. For the instance of an ONNX model, you can +# then load it into memory using the ONNX runtime. +# +# .. 
note:: Working with Other Model Formats +# +# TVM supports many popular model formats. A list can be found in the `Compile +# Deep Learning Models +# `_ +# section of the TVM Documentation. + +model_url = "".join( + [ + "https://github.com/onnx/models/raw/", + "master/vision/classification/resnet/model/", + "resnet50-v2-7.onnx", + ] +) + +model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx") +onnx_model = onnx.load(model_path) + +################################################################################ +# Downloading, Preprocessing, and Loading the Test Image +# ------------------------------------------------------ +# +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. +# +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. +# +# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg +# :height: 224px +# :width: 224px +# :align: center +# +# Download the image data, then convert it to a numpy array to use as an input to the model. + +img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +img_path = download_testdata(img_url, "imagenet_cat.png", module="data") + +# Resize it to 224x224 +resized_image = Image.open(img_path).resize((224, 224)) +img_data = np.asarray(resized_image).astype("float32") + +# Our input image is in HWC layout while ONNX expects CHW input, so convert the array +img_data = np.transpose(img_data, (2, 0, 1)) + +# Normalize according to the ImageNet input specification +imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) +norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev + +# Add the batch dimension, as we are expecting 4-dimensional input: NCHW. +img_data = np.expand_dims(norm_img_data, axis=0) + +############################################################################### +# Compile the Model With Relay +# ---------------------------- +# +# The next step is to compile the ResNet model. We begin by importing the model +# to relay using the `from_onnx` importer. We then build the model, with +# standard optimizations, into a TVM library. Finally, we create a TVM graph +# runtime module from the library. + +target = "llvm" + +###################################################################### +# .. note:: Defining the Correct Target +# +# Specifying the correct target can have a huge impact on the performance of +# the compiled module, as it can take advantage of hardware features +# available on the target. For more information, please refer to `Auto-tuning +# a convolutional network for x86 CPU +# `_. +# We recommend identifying which CPU you are running, along with optional +# features, and set the target appropriately. For example, for some +# processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm +# -mcpu=skylake-avx512"`` for processors with the AVX-512 vector instruction +# set. +# + +# The input name may vary across model types. 
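# A hedged aside (not in the original tutorial): the graph input names can also
# be listed programmatically from the ONNX model loaded above, e.g.
#
# .. code-block:: python
#
#   print([i.name for i in onnx_model.graph.input])
#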
You can use a tool +# like netron to check input names +input_name = "data" +shape_dict = {input_name: img_data.shape} + +mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) + +with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) + +###################################################################### +# Execute on the TVM Runtime +# -------------------------- +# Now that we've compiled the model, we can use the TVM runtime to make +# predictions with it. To use TVM to run the model and make predictions, we +# need two things: +# +# - The compiled model, which we just produced. +# - Valid input to the model to make predictions on. + +dtype = "float32" +module.set_input(input_name, img_data) +module.run() +output_shape = (1, 1000) +tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() + +################################################################################ +# Collect Basic Performance Data +# ------------------------------ +# We want to collect some basic performance data associated with this +# unoptimized model and compare it to a tuned model later. To help account for +# CPU noise, we run the computation in multiple batches in multiple +# repetitions, then gather some basis statistics on the mean, median, and +# standard deviation. +import timeit + +timing_number = 10 +timing_repeat = 10 +unoptimized = ( + np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) + * 1000 + / timing_number +) +unoptimized = { + "mean": np.mean(unoptimized), + "median": np.median(unoptimized), + "std": np.std(unoptimized), +} + +print(unoptimized) + +################################################################################ +# Postprocess the output +# ---------------------- +# +# As previously mentioned, each model will have its own particular way of +# providing output tensors. +# +# In our case, we need to run some post-processing to render the outputs from +# ResNet-50-V2 into a more human-readable form, using the lookup-table provided +# for the model. + +from scipy.special import softmax + +# Download a list of labels +labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +labels_path = download_testdata(labels_url, "synset.txt", module="data") + +with open(labels_path, "r") as f: + labels = [l.rstrip() for l in f] + +# Open the output and read the output tensor +scores = softmax(tvm_output) +scores = np.squeeze(scores) +ranks = np.argsort(scores)[::-1] +for rank in ranks[0:5]: + print("class='%s' with probability=%f" % (labels[rank], scores[rank])) + +################################################################################ +# This should produce the following output: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Tune the model +# -------------- +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVM to target your working platform. 
+# +# In some cases, we might not get the expected performance when running +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner, to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. +# The results of these runs are stored in a tuning records file. +# +# In the simplest form, tuning requires you to provide three things: +# +# - the target specification of the device you intend to run this model on +# - the path to an output file in which the tuning records will be stored +# - a path to the model to be tuned. +# + +import tvm.auto_scheduler as auto_scheduler +from tvm.autotvm.tuner import XGBTuner +from tvm import autotvm + +# Set up some basic parameters for the runner. The runner takes compiled code +# that is generated with a specific set of parameters and measures the +# performance of it. ``number`` specifies the number of different +# configurations that we will test, while ``repeat`` specifies how many +# measurements we will take of each configuration. ``min_repeat_ms`` is a value +# that specifies how long need to run configuration test. If the number of +# repeats falls under this time, it will be increased. This option is necessary +# for accurate tuning on GPUs, and is not required for CPU tuning. Setting this +# value to 0 disables it. The ``timeout`` places an upper limit on how long to +# run training code for each tested configuration. + +number = 10 +repeat = 1 +min_repeat_ms = 0 # since we're tuning on a CPU, can be set to 0 +timeout = 10 # in seconds + +# create a TVM runner +runner = autotvm.LocalRunner( + number=number, + repeat=repeat, + timeout=timeout, + min_repeat_ms=min_repeat_ms, + enable_cpu_cache_flush=True, +) + +# Create a simple structure for holding tuning options. We use an XGBoost +# algorithim for guiding the search. For a production job, you will want to set +# the number of trials to be larger than the value of 10 used here. For CPU we +# recommend 1500, for GPU 3000-4000. The number of trials required can depend +# on the particular model and processor, so it's worth spending some time +# evaluating performance across a range of values to find the best balance +# between tuning time and model optimization. Because running tuning is time +# intensive we set number of trials to 10, but do not recommend a value this +# small. The ``early_stopping`` parameter is the minimum number of trails to +# run before a condition that stops the search early can be applied. The +# measure option indicates where trial code will be built, and where it will be +# run. In this case, we're using the ``LocalRunner`` we just created and a +# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write +# the tuning data to. + +tuning_option = { + "tuner": "xgb", + "trials": 10, + "early_stopping": 100, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func="default"), runner=runner + ), + "tuning_records": "resnet-50-v2-autotuning.json", +} + +################################################################################ +# .. 
note:: Defining the Tuning Search Algorithm +# +# By default this search is guided using an `XGBoost Grid` algorithm. +# Depending on your model complexity and amount of time available, you might +# want to choose a different algorithm. + + +################################################################################ +# .. note:: Setting Tuning Parameters +# +# In this example, in the interest of time, we set the number of trials and +# early stopping to 10. You will likely see more performance improvements if +# you set these values to be higher but this comes at the expense of time +# spent tuning. The number of trials required for convergence will vary +# depending on the specifics of the model and the target platform. + +# begin by extracting the taks from the onnx model +tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params) + +# Tune the extracted tasks sequentially. +for i, task in enumerate(tasks): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + tuner_obj = XGBTuner(task, loss_type="rank") + tuner_obj.tune( + n_trial=min(tuning_option["trials"], len(task.config_space)), + early_stopping=tuning_option["early_stopping"], + measure_option=tuning_option["measure_option"], + callbacks=[ + autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix), + autotvm.callback.log_to_file(tuning_option["tuning_records"]), + ], + ) + +################################################################################ +# The output from this tuning process will look something like this: +# +# .. code-block:: bash +# +# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done. +# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done. +# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done. +# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done. +# # [Task 4/24] Current/Best: 11.96/ 156.49 GFLOPS | Progress: (960/1000) | 632.26 s Done. +# # [Task 5/24] Current/Best: 23.75/ 130.78 GFLOPS | Progress: (800/1000) | 739.29 s Done. +# # [Task 6/24] Current/Best: 38.29/ 198.31 GFLOPS | Progress: (1000/1000) | 624.51 s Done. +# # [Task 7/24] Current/Best: 4.31/ 210.78 GFLOPS | Progress: (1000/1000) | 701.03 s Done. +# # [Task 8/24] Current/Best: 50.25/ 185.35 GFLOPS | Progress: (972/1000) | 538.55 s Done. +# # [Task 9/24] Current/Best: 50.19/ 194.42 GFLOPS | Progress: (1000/1000) | 487.30 s Done. +# # [Task 10/24] Current/Best: 12.90/ 172.60 GFLOPS | Progress: (972/1000) | 607.32 s Done. +# # [Task 11/24] Current/Best: 62.71/ 203.46 GFLOPS | Progress: (1000/1000) | 581.92 s Done. +# # [Task 12/24] Current/Best: 36.79/ 224.71 GFLOPS | Progress: (1000/1000) | 675.13 s Done. +# # [Task 13/24] Current/Best: 7.76/ 219.72 GFLOPS | Progress: (1000/1000) | 519.06 s Done. +# # [Task 14/24] Current/Best: 12.26/ 202.42 GFLOPS | Progress: (1000/1000) | 514.30 s Done. +# # [Task 15/24] Current/Best: 31.59/ 197.61 GFLOPS | Progress: (1000/1000) | 558.54 s Done. +# # [Task 16/24] Current/Best: 31.63/ 206.08 GFLOPS | Progress: (1000/1000) | 708.36 s Done. +# # [Task 17/24] Current/Best: 41.18/ 204.45 GFLOPS | Progress: (1000/1000) | 736.08 s Done. +# # [Task 18/24] Current/Best: 15.85/ 222.38 GFLOPS | Progress: (980/1000) | 516.73 s Done. +# # [Task 19/24] Current/Best: 15.78/ 203.41 GFLOPS | Progress: (1000/1000) | 587.13 s Done. +# # [Task 20/24] Current/Best: 30.47/ 205.92 GFLOPS | Progress: (980/1000) | 471.00 s Done. 
+# # [Task 21/24] Current/Best: 46.91/ 227.99 GFLOPS | Progress: (308/1000) | 219.18 s Done. +# # [Task 22/24] Current/Best: 13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done. +# # [Task 23/24] Current/Best: 53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done. +# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done. + +################################################################################ +# Compiling an Optimized Model with Tuning Data +# ---------------------------------------------- +# +# As an output of the tuning process above, we obtained the tuning records +# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to +# generate high performance code for the model on your specified target. +# +# Now that tuning data for the model has been collected, we can re-compile the +# model using optimized operators to speed up our computations. + +with autotvm.apply_history_best(tuning_option["tuning_records"]): + with tvm.transform.PassContext(opt_level=3, config={}): + lib = relay.build(mod, target=target, params=params) + +dev = tvm.device(str(target), 0) +module = graph_executor.GraphModule(lib["default"](dev)) + +################################################################################ +# Verify that the optimized model runs and produces the same results: + +dtype = "float32" +module.set_input(input_name, img_data) +module.run() +output_shape = (1, 1000) +tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() + +scores = softmax(tvm_output) +scores = np.squeeze(scores) +ranks = np.argsort(scores)[::-1] +for rank in ranks[0:5]: + print("class='%s' with probability=%f" % (labels[rank], scores[rank])) + +# Verifying that the predictions are the same: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610550 +# # class='n02123159 tiger cat' with probability=0.367181 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Comparing the Tuned and Untuned Models +# -------------------------------------- +# We want to collect some basic performance data associated with this optimized +# model to compare it to the unoptimized model. Depending on your underlying +# hardware, number of iterations, and other factors, you should see a performance +# improvement in comparing the optimized model to the unoptimized model. + +import timeit + +timing_number = 10 +timing_repeat = 10 +optimized = ( + np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) + * 1000 + / timing_number +) +optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)} + + +print("optimized: %s" % (optimized)) +print("unoptimized: %s" % (unoptimized)) + +################################################################################ +# Final Remarks +# ------------- +# +# In this tutorial, we gave a short example of how to use the TVM Python API +# to compile, run, and tune a model. We also discussed the need for pre and +# post-processing of inputs and outputs. After the tuning process, we +# demonstrated how to compare the performance of the unoptimized and optimize +# models. +# +# Here we presented a simple example using ResNet 50 V2 locally. 
However, TVM +# supports many more features including cross-compilation, remote execution and +# profiling/benchmarking. diff --git a/gallery/user_tutorials/cross_compilation_and_rpc.py b/gallery/user_tutorials/cross_compilation_and_rpc.py new file mode 100644 index 000000000000..25208369f74d --- /dev/null +++ b/gallery/user_tutorials/cross_compilation_and_rpc.py @@ -0,0 +1,265 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-cross-compilation-and-rpc: + +Cross Compilation and RPC +========================= +**Author**: `Ziheng Jiang `_, `Lianmin Zheng `_ + +This tutorial introduces cross compilation and remote device +execution with RPC in TVM. + +With cross compilation and RPC, you can **compile a program on your +local machine then run it on the remote device**. It is useful when +the remote device resource are limited, like Raspberry Pi and mobile +platforms. In this tutorial, we will use the Raspberry Pi for a CPU example +and the Firefly-RK3399 for an OpenCL example. +""" + +###################################################################### +# Build TVM Runtime on Device +# --------------------------- +# +# The first step is to build the TVM runtime on the remote device. +# +# .. note:: +# +# All instructions in both this section and the next section should be +# executed on the target device, e.g. Raspberry Pi. We assume the target +# is running Linux. +# +# Since we do compilation on the local machine, the remote device is only used +# for running the generated code. We only need to build the TVM runtime on +# the remote device. +# +# .. code-block:: bash +# +# git clone --recursive https://github.com/apache/tvm tvm +# cd tvm +# make runtime -j2 +# +# After building the runtime successfully, we need to set environment variables +# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` +# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM +# directory is in :code:`~/tvm`): +# +# .. code-block:: bash +# +# export PYTHONPATH=$PYTHONPATH:~/tvm/python +# +# To update the environment variables, execute :code:`source ~/.bashrc`. + +###################################################################### +# Set Up RPC Server on Device +# --------------------------- +# To start an RPC server, run the following command on your remote device +# (Which is Raspberry Pi in this example). +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 +# +# If you see the line below, it means the RPC server started +# successfully on your device. +# +# .. 
code-block:: bash +# +# INFO:root:RPCServer: bind to 0.0.0.0:9090 +# + +###################################################################### +# Declare and Cross Compile Kernel on Local Machine +# ------------------------------------------------- +# +# .. note:: +# +# Now we go back to the local machine, which has a full TVM installed +# (with LLVM). +# +# Here we will declare a simple kernel on the local machine: + +import numpy as np + +import tvm +from tvm import te +from tvm import rpc +from tvm.contrib import utils + +n = tvm.runtime.convert(1024) +A = te.placeholder((n,), name="A") +B = te.compute((n,), lambda i: A[i] + 1.0, name="B") +s = te.create_schedule(B.op) + +###################################################################### +# Then we cross compile the kernel. +# The target should be 'llvm -mtriple=armv7l-linux-gnueabihf' for +# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable +# on our webpage building server. See the detailed note in the following block. + +local_demo = True + +if local_demo: + target = "llvm" +else: + target = "llvm -mtriple=armv7l-linux-gnueabihf" + +func = tvm.build(s, [A, B], target=target, name="add_one") +# save the lib at a local temp folder +temp = utils.tempdir() +path = temp.relpath("lib.tar") +func.export_library(path) + +###################################################################### +# .. note:: +# +# To run this tutorial with a real remote device, change :code:`local_demo` +# to False and replace :code:`target` in :code:`build` with the appropriate +# target triple for your device. The target triple which might be +# different for different devices. For example, it is +# :code:`'llvm -mtriple=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and +# :code:`'llvm -mtriple=aarch64-linux-gnu'` for RK3399. +# +# Usually, you can query the target by running :code:`gcc -v` on your +# device, and looking for the line starting with :code:`Target:` +# (Though it may still be a loose configuration.) +# +# Besides :code:`-mtriple`, you can also set other compilation options +# like: +# +# * -mcpu= +# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture. +# * -mattr=a1,+a2,-a3,... +# Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU. +# To get the list of available attributes, you can do: +# +# .. code-block:: bash +# +# llc -mtriple= -mattr=help +# +# These options are consistent with `llc `_. +# It is recommended to set target triple and feature set to contain specific +# feature available, so we can take full advantage of the features of the +# board. +# You can find more details about cross compilation attributes from +# `LLVM guide of cross compilation `_. + +###################################################################### +# Run CPU Kernel Remotely by RPC +# ------------------------------ +# We show how to run the generated CPU kernel on the remote device. +# First we obtain an RPC session from remote device. + +if local_demo: + remote = rpc.LocalSession() +else: + # The following is my environment, change this to the IP address of your target device + host = "10.77.1.162" + port = 9090 + remote = rpc.connect(host, port) + +###################################################################### +# Upload the lib to the remote device, then invoke a device local +# compiler to relink them. 
Now `func` is a remote module object. + +remote.upload(path) +func = remote.load_module("lib.tar") + +# create arrays on the remote device +dev = remote.cpu() +a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) +b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) +# the function will run on the remote device +func(a, b) +np.testing.assert_equal(b.numpy(), a.numpy() + 1) + +###################################################################### +# When you want to evaluate the performance of the kernel on the remote +# device, it is important to avoid the overhead of network. +# :code:`time_evaluator` will returns a remote function that runs the +# function over number times, measures the cost per run on the remote +# device and returns the measured cost. Network overhead is excluded. + +time_f = func.time_evaluator(func.entry_name, dev, number=10) +cost = time_f(a, b).mean +print("%g secs/op" % cost) + +######################################################################### +# Run OpenCL Kernel Remotely by RPC +# --------------------------------- +# For remote OpenCL devices, the workflow is almost the same as above. +# You can define the kernel, upload files, and run via RPC. +# +# .. note:: +# +# Raspberry Pi does not support OpenCL, the following code is tested on +# Firefly-RK3399. You may follow this `tutorial `_ +# to setup the OS and OpenCL driver for RK3399. +# +# Also we need to build the runtime with OpenCL enabled on rk3399 board. In the TVM +# root directory, execute +# +# .. code-block:: bash +# +# cp cmake/config.cmake . +# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake +# make runtime -j4 +# +# The following function shows how we run an OpenCL kernel remotely + + +def run_opencl(): + # NOTE: This is the setting for my rk3399 board. You need to modify + # them according to your environment. + opencl_device_host = "10.77.1.145" + opencl_device_port = 9090 + target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") + + # create schedule for the above "add one" compute declaration + s = te.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=32) + s[B].bind(xo, te.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) + func = tvm.build(s, [A, B], target=target) + + remote = rpc.connect(opencl_device_host, opencl_device_port) + + # export and upload + path = temp.relpath("lib_cl.tar") + func.export_library(path) + remote.upload(path) + func = remote.load_module("lib_cl.tar") + + # run + dev = remote.cl() + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) + func(a, b) + np.testing.assert_equal(b.numpy(), a.numpy() + 1) + print("OpenCL test passed!") + + +###################################################################### +# Summary +# ------- +# This tutorial provides a walk through of cross compilation and RPC +# features in TVM. +# +# - Set up an RPC server on the remote device. +# - Set up the target device configuration to cross compile the kernels on the +# local machine. +# - Upload and run the kernels remotely via the RPC API. diff --git a/gallery/user_tutorials/install.py b/gallery/user_tutorials/install.py new file mode 100644 index 000000000000..b69b8b493a4f --- /dev/null +++ b/gallery/user_tutorials/install.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Installing TVM +============== +**Authors**: +`Jocelyn Shiue `_, +`Chris Hoge `_ + +Depending on your needs and your working environment, there are a few different +methods for installing TVM. These include: + +* Installing from source +* Installing from third-party binary package. +""" + +################################################################################ +# Installing from Source +# ---------------------- +# Installing from source is the recommended method for installing TVM. It will +# allow you to enable specific features such as GPU support, microcontroller +# support (microTVM), and a debugging runtime, and other features. You will also +# want to install from source if you want to actively contribute to the TVM +# project. The full instructions are on the `Install TVM From Source +# `_ page. + +################################################################################ +# Installing From Binary Packages +# -------------------------------- +# You may install convenient third-party binary package distributions to +# quickly try things out. TLCPack is a thirdparty volunteer community that +# builds binary packages from TVM source. It offers support matrix with +# instructions to install on different platforms, with different features. +# Checkout `TLCPack `_ to learn more. Note that the +# thirdparty binary packages could contain additional licensing terms for +# the hardware drivers that are bundled with it. diff --git a/gallery/user_tutorials/introduction.py b/gallery/user_tutorials/introduction.py new file mode 100644 index 000000000000..0746c3983b61 --- /dev/null +++ b/gallery/user_tutorials/introduction.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Introduction +============ +**Authors**: +`Jocelyn Shiue `_, +`Chris Hoge `_, +`Lianmin Zheng `_ + +Apache TVM is an open source machine learning compiler framework for CPUs, +GPUs, and machine learning accelerators. It aims to enable machine learning +engineers to optimize and run computations efficiently on any hardware backend. 
+The purpose of this tutorial is to take a guided tour through all of the major +features of TVM by defining and demonstrating key concepts. A new user should +be able to work through the tutorial from start to finish and be able to +operate TVM for automatic model optimization, while having a basic +understanding of the TVM architecture and how it works. + +Contents +-------- + +#. :doc:`Introduction ` +#. :doc:`Installing TVM ` +#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` +#. :doc:`Compiling and Optimizing a Model with the Python Interface ` +#. :doc:`Working with Operators Using Tensor Expression ` +#. :doc:`Optimizing Operators with Templates and AutoTVM ` +#. :doc:`Optimizing Operators with Template-free AutoScheduler ` +#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) ` +#. :doc:`Compiling Deep Learning Models for GPUs ` +""" + +################################################################################ +# An Overview of TVM and Model Optimization +# ========================================= +# +# The diagram below illustrates the steps a machine model takes as it is +# transformed with the TVM optimizing compiler framework. +# +# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png +# :width: 100% +# :alt: A High Level View of TVM +# +# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*. +# The importer layer is where TVM can ingest models from other frameworks, like +# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each +# frontend varies as we are constantly improving the open source project. If +# you're having issues importing your model into TVM, you may want to try +# converting it to ONNX. +# +# 2. Translate to *Relay*, TVM's high-level model language. +# A model that has been imported into TVM is represented in Relay. Relay is a +# functional language and intermediate representation (IR) for neural networks. +# It has support for: +# +# - Traditional data flow-style representations +# - Functional-style scoping, let-binding which makes it a fully featured +# differentiable language +# - Ability to allow the user to mix the two programming styles +# +# Relay applies graph-level optimization passes to optimize the model. +# +# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a +# higher-level representation is transformed into a lower-level +# representation. After applying the high-level optimizations, Relay +# runs FuseOps pass to partition the model into many small subgraphs and lowers +# the subgraphs to TE representation. Tensor Expression (TE) is a +# domain-specific language for describing tensor computations. +# TE also provides several *schedule* primitives to specify low-level loop +# optimizations, such as tiling, vectorization, parallelization, +# unrolling, and fusion. +# To aid in the process of converting Relay representation into TE representation, +# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined +# templates of common tensor operators (e.g., conv2d, transpose). +# +# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. +# A schedule specifies the low-level loop optimizations for an operator or +# subgraph defined in TE. Auto-tuning modules search for the best schedule +# and compare them with cost models and on-device measurements. +# There are two auto-tuning modules in TVM. +# +# - **AutoTVM**: A template-based auto-tuning module. 
It runs search algorithms +# to find the best values for the tunable knobs in a user-defined template. +# For common operators, their templates are already provided in TOPI. +# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. +# It does not require pre-defined schedule templates. Instead, it generates +# the search space automatically by analyzing the computation definition. +# It then searches for the best schedule in the generated search space. +# +# 5. Choose the optimal configurations for model compilation. After tuning, the +# auto-tuning module generates tuning records in JSON format. This step +# picks the best schedule for each subgraph. +# +# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level +# intermediate representation. After selecting the optimal configurations +# based on the tuning step, each TE subgraph is lowered to TIR and be +# optimized by low-level optimization passes. Next, the optimized TIR is +# lowered to the target compiler of the hardware platform. +# This is the final code generation phase to produce an optimized model +# that can be deployed into production. TVM supports several different +# compiler backends including: +# +# - LLVM, which can target arbitrary microprocessor architecture including +# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any +# other platform supported by LLVM. +# - Specialized compilers, such as NVCC, NVIDIA's compiler. +# - Embedded and specialized targets, which are implemented through TVM's +# Bring Your Own Codegen (BYOC) framework. +# +# 7. Compile down to machine code. At the end of this process, the +# compiler-specific generated code can be lowered to machine code. +# +# TVM can compile models down to a linkable object module, which can then be +# run with a lightweight TVM runtime that provides C APIs to dynamically +# load the model, and entry points for other languages such as Python and +# Rust. TVM can also build a bundled deployment in which the runtime is +# combined with the model in a single package. +# +# The remainder of the tutorial will cover these aspects of TVM in more detail. diff --git a/gallery/user_tutorials/relay_quick_start.py b/gallery/user_tutorials/relay_quick_start.py new file mode 100644 index 000000000000..fd7f5aa9d756 --- /dev/null +++ b/gallery/user_tutorials/relay_quick_start.py @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-relay-quick-start: + +Quick Start Tutorial for Compiling Deep Learning Models +======================================================= +**Author**: `Yao Wang `_, `Truman Tian `_ + +This example shows how to build a neural network with Relay python frontend and +generates a runtime library for Nvidia GPU with TVM. 
+Notice that you need to build TVM with cuda and llvm enabled. +""" + +###################################################################### +# Overview for Supported Hardware Backend of TVM +# ---------------------------------------------- +# The image below shows hardware backend currently supported by TVM: +# +# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png +# :align: center +# +# In this tutorial, we'll choose cuda and llvm as target backends. +# To begin with, let's import Relay and TVM. + +import numpy as np + +from tvm import relay +from tvm.relay import testing +import tvm +from tvm import te +from tvm.contrib import graph_executor +import tvm.testing + +###################################################################### +# Define Neural Network in Relay +# ------------------------------ +# First, let's define a neural network with relay python frontend. +# For simplicity, we'll use pre-defined resnet-18 network in Relay. +# Parameters are initialized with Xavier initializer. +# Relay also supports other model formats such as MXNet, CoreML, ONNX and +# Tensorflow. +# +# In this tutorial, we assume we will do inference on our device and +# the batch size is set to be 1. Input images are RGB color images of +# size 224 * 224. We can call the +# :py:meth:`tvm.relay.expr.TupleWrapper.astext()` to show the network +# structure. + +batch_size = 1 +num_class = 1000 +image_shape = (3, 224, 224) +data_shape = (batch_size,) + image_shape +out_shape = (batch_size, num_class) + +mod, params = relay.testing.resnet.get_workload( + num_layers=18, batch_size=batch_size, image_shape=image_shape +) + +# set show_meta_data=True if you want to show meta data +print(mod.astext(show_meta_data=False)) + +###################################################################### +# Compilation +# ----------- +# Next step is to compile the model using the Relay/TVM pipeline. +# Users can specify the optimization level of the compilation. +# Currently this value can be 0 to 3. The optimization passes include +# operator fusion, pre-computation, layout transformation and so on. +# +# :py:func:`relay.build` returns three components: the execution graph in +# json format, the TVM module library of compiled functions specifically +# for this graph on the target hardware, and the parameter blobs of +# the model. During the compilation, Relay does the graph-level +# optimization while TVM does the tensor-level optimization, resulting +# in an optimized runtime module for model serving. +# +# We'll first compile for Nvidia GPU. Behind the scene, :py:func:`relay.build` +# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., +# then registers the operators (i.e. the nodes of the optimized graphs) to +# TVM implementations to generate a `tvm.module`. +# To generate the module library, TVM will first transfer the high level IR +# into the lower intrinsic IR of the specified target backend, which is CUDA +# in this example. Then the machine code will be generated as the module library. + +opt_level = 3 +target = tvm.target.cuda() +with tvm.transform.PassContext(opt_level=opt_level): + lib = relay.build(mod, target, params=params) + +##################################################################### +# Run the generate library +# ------------------------ +# Now we can create graph executor and run the module on Nvidia GPU. 
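# Before running, you can optionally confirm that your TVM build has CUDA
# enabled and that a GPU is actually visible. This is a small convenience
# sketch added here, not part of the original flow; ``tvm.cuda(0).exist`` is
# False when no CUDA device can be found.
if not tvm.cuda(0).exist:
    print("No CUDA device detected: the steps below need a CUDA-enabled TVM build and a GPU.")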
+ +# create random input +dev = tvm.cuda() +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +# create module +module = graph_executor.GraphModule(lib["default"](dev)) +# set input and parameters +module.set_input("data", data) +# run +module.run() +# get output +out = module.get_output(0, tvm.nd.empty(out_shape)).numpy() + +# Print first 10 elements of output +print(out.flatten()[0:10]) + +###################################################################### +# Save and Load Compiled Module +# ----------------------------- +# We can also save the graph, lib and parameters into files and load them +# back in deploy environment. + +#################################################### + +# save the graph, lib and params into separate files +from tvm.contrib import utils + +temp = utils.tempdir() +path_lib = temp.relpath("deploy_lib.tar") +lib.export_library(path_lib) +print(temp.listdir()) + +#################################################### + +# load the module back. +loaded_lib = tvm.runtime.load_module(path_lib) +input_data = tvm.nd.array(data) + +module = graph_executor.GraphModule(loaded_lib["default"](dev)) +module.run(data=input_data) +out_deploy = module.get_output(0).numpy() + +# Print first 10 elements of output +print(out_deploy.flatten()[0:10]) + +# check whether the output from deployed module is consistent with original one +tvm.testing.assert_allclose(out_deploy, out, atol=1e-5) diff --git a/gallery/user_tutorials/tensor_expr_get_started.py b/gallery/user_tutorials/tensor_expr_get_started.py new file mode 100644 index 000000000000..310d6bdbfee4 --- /dev/null +++ b/gallery/user_tutorials/tensor_expr_get_started.py @@ -0,0 +1,903 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _tutorial-tensor-expr-get-started: + +Working with Operators Using Tensor Expression +============================================== +**Author**: `Tianqi Chen `_ + +In this tutorial we will turn our attention to how TVM works with Tensor +Expression (TE) to define tensor computations and apply loop optimizations. TE +describes tensor computations in a pure functional language (that is each +expression has no side effects). When viewed in context of the TVM as a whole, +Relay describes a computation as a set of operators, and each of these +operators can be represented as a TE expression where each TE expression takes +input tensors and produces an output tensor. + +This is an introductory tutorial to the Tensor Expression language in TVM. TVM +uses a domain specific tensor expression for efficient kernel construction. We +will demonstrate the basic workflow with two examples of using the tensor expression +language. The first example introduces TE and scheduling with vector +addition. 
The second expands on these concepts with a step-by-step optimization +of a matrix multiplication with TE. This matrix multiplication example will +serve as the comparative basis for future tutorials covering more advanced +features of TVM. +""" + +################################################################################ +# Example 1: Writing and Scheduling Vector Addition in TE for CPU +# --------------------------------------------------------------- +# +# Let's look at an example in Python in which we will implement a TE for +# vector addition, followed by a schedule targeted towards a CPU. +# We begin by initializing a TVM environment. + +import tvm +import tvm.testing +from tvm import te +import numpy as np + +# You will get better performance if you can identify the CPU you are targeting +# and specify it. If you're using llvm, you can get this information from the +# command ``llc --version`` to get the CPU type, and you can check +# ``/proc/cpuinfo`` for additional extensions that your processor might +# support. For example, you can use "llvm -mcpu=skylake-avx512" for CPUs with +# AVX-512 instructions. + +tgt = tvm.target.Target(target="llvm", host="llvm") + +################################################################################ +# Describing the Vector Computation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# We describe a vector addition computation. TVM adopts tensor semantics, with +# each intermediate result represented as a multi-dimensional array. The user +# needs to describe the computation rule that generates the tensors. We first +# define a symbolic variable ``n`` to represent the shape. We then define two +# placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then +# describe the result tensor ``C``, with a ``compute`` operation. The +# ``compute`` defines a computation, with the output conforming to the +# specified tensor shape and the computation to be performed at each position +# in the tensor defined by the lambda function. Note that while ``n`` is a +# variable, it defines a consistent shape between the ``A``, ``B`` and ``C`` +# tensors. Remember, no actual computation happens during this phase, as we +# are only declaring how the computation should be done. + +n = te.var("n") +A = te.placeholder((n,), name="A") +B = te.placeholder((n,), name="B") +C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + +################################################################################ +# .. note:: Lambda Functions +# +# The second argument to the ``te.compute`` method is the function that +# performs the computation. In this example, we're using an anonymous function, +# also known as a ``lambda`` function, to define the computation, in this case +# addition on the ``i``th element of ``A`` and ``B``. + +################################################################################ +# Create a Default Schedule for the Computation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# While the above lines describe the computation rule, we can compute ``C`` in +# many different ways to fit different devices. For a tensor with multiple +# axes, you can choose which axis to iterate over first, or computations can be +# split across different threads. TVM requires that the user to provide a +# schedule, which is a description of how the computation should be performed. +# Scheduling operations within TE can change loop orders, split computations +# across different threads, group blocks of data together, amongst other +# operations. 
An important concept behind schedules is that they only describe
# how the computation is performed, so different schedules for the same TE will
# produce the same result.
#
# TVM allows you to create a naive schedule that will compute ``C`` by
# iterating in row major order.
#
# .. code-block:: c
#
#   for (int i = 0; i < n; ++i) {
#     C[i] = A[i] + B[i];
#   }

s = te.create_schedule(C.op)

######################################################################
# Compile and Evaluate the Default Schedule
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# With the TE expression and a schedule, we can produce runnable code for our
# target language and architecture, in this case LLVM and a CPU. We provide
# TVM with the schedule, a list of the TE expressions that are in the schedule,
# the target and host, and the name of the function we are producing. The
# result is a type-erased function that can be called directly from Python.
#
# In the following line, we use tvm.build to create a function. The build
# function takes the schedule, the desired signature of the function (including
# the inputs and outputs) as well as the target language we want to compile to.

fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

################################################################################
# Let's run the function and compare the output to the same computation in
# numpy. The compiled TVM function exposes a concise C API that can be invoked
# from any language. We begin by creating a device (the CPU in this example)
# that TVM can compile the schedule to. In this case the device is an LLVM CPU
# target. We can then initialize the tensors on our device and perform the
# custom addition operation. To verify that the computation is correct, we can
# compare the output stored in the ``c`` tensor to the same computation
# performed by numpy.

dev = tvm.device(tgt.kind.name, 0)

n = 1024
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# To get a comparison of how fast this version is compared to numpy, create a
# helper function to run a profile of the TVM generated code.
+
import timeit

np_repeat = 100
np_running_time = timeit.timeit(
    setup="import numpy\n"
    "n = 32768\n"
    'dtype = "float32"\n'
    "a = numpy.random.rand(n, 1).astype(dtype)\n"
    "b = numpy.random.rand(n, 1).astype(dtype)\n",
    stmt="answer = a + b",
    number=np_repeat,
)
print("Numpy running time: %f" % (np_running_time / np_repeat))


def evaluate_addition(func, target, optimization, log):
    dev = tvm.device(target.kind.name, 0)
    n = 32768
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
    mean_time = evaluator(a, b, c).mean
    print("%s: %f" % (optimization, mean_time))

    log.append((optimization, mean_time))


log = [("numpy", np_running_time / np_repeat)]
evaluate_addition(fadd, tgt, "naive", log=log)

################################################################################
# Updating the Schedule to Use Parallelism
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Now that we've illustrated the fundamentals of TE, let's go deeper into what
# schedules do, and how they can be used to optimize tensor expressions for
# different architectures. A schedule is a series of steps that are applied to
# an expression to transform it in a number of different ways. When a schedule
# is applied to an expression in TE, the inputs and outputs remain the same,
# but when compiled the implementation of the expression can change. This
# tensor addition, in the default schedule, is run serially but is easy to
# parallelize across all of the processor threads. We can apply the parallel
# schedule operation to our computation.

s[C].parallel(C.op.axis[0])

################################################################################
# The ``tvm.lower`` command will generate the Intermediate Representation (IR)
# of the TE, with the corresponding schedule. By lowering the expression as we
# apply different schedule operations, we can see the effect of scheduling on
# the ordering of the computation. We use the flag ``simple_mode=True`` to
# return a readable C-style statement.

print(tvm.lower(s, [A, B, C], simple_mode=True))

################################################################################
# It's now possible for TVM to run these blocks on independent threads. Let's
# compile and run this new schedule with the parallel operation applied:

fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel")
fadd_parallel(a, b, c)

tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

evaluate_addition(fadd_parallel, tgt, "parallel", log=log)

################################################################################
# Updating the Schedule to Use Vectorization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Modern CPUs also have the ability to perform SIMD operations on floating
# point values, and we can apply another schedule to our computation expression
# to take advantage of this. Accomplishing this requires multiple steps: first
# we have to split the schedule into inner and outer loops using the split
# scheduling primitive. The inner loops can then use SIMD instructions via the
# vectorize scheduling primitive, and the outer loops can be parallelized using
# the parallel scheduling primitive. Choose the split factor to be the number
# of threads on your CPU.
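
# As a convenience, you can look up how many CPUs the host reports using
# Python's standard library. This is only a sketch added here to help you pick
# a value; the factor below stays hard-coded so the tutorial behaves the same
# everywhere.
import multiprocessing

print("This machine reports %d logical CPUs" % multiprocessing.cpu_count())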
+ +# Recreate the schedule, since we modified it with the parallel operation in +# the previous example +n = te.var("n") +A = te.placeholder((n,), name="A") +B = te.placeholder((n,), name="B") +C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + +s = te.create_schedule(C.op) + +# This factor should be chosen to match the number of threads appropriate for +# your CPU. This will vary depending on architecture, but a good rule is +# setting this factor to equal the number of available CPU cores. +factor = 4 + +outer, inner = s[C].split(C.op.axis[0], factor=factor) +s[C].parallel(outer) +s[C].vectorize(inner) + +fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") + +evaluate_addition(fadd_vector, tgt, "vector", log=log) + +print(tvm.lower(s, [A, B, C], simple_mode=True)) + +################################################################################ +# Comparing the Diferent Schedules +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# We can now compare the different schedules + +baseline = log[0][1] +print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) +for result in log: + print( + "%s\t%s\t%s" + % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) + ) + + +################################################################################ +# .. note:: Code Specialization +# +# As you may have noticed, the declarations of ``A``, ``B`` and ``C`` all +# take the same shape argument, ``n``. TVM will take advantage of this to +# pass only a single shape argument to the kernel, as you will find in the +# printed device code. This is one form of specialization. +# +# On the host side, TVM will automatically generate check code that checks +# the constraints in the parameters. So if you pass arrays with different +# shapes into fadd, an error will be raised. +# +# We can do more specializations. For example, we can write :code:`n = +# tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, in the +# computation declaration. The generated function will only take vectors with +# length 1024. + +################################################################################ +# We've defined, scheduled, and compiled a vector addition operator, which we +# were then able to execute on the TVM runtime. We can save the operator as a +# library, which we can then load later using the TVM runtime. + +################################################################################ +# Targeting Vector Addition for GPUs (Optional) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# TVM is capable of targeting multiple architectures. In the next example, we +# will target compilation of the vector addition to GPUs. + +# If you want to run this code, change ``run_cuda = True`` +# Note that by default this example is not run in the docs CI. + +run_cuda = False +if run_cuda: + # Change this target to the correct backend for you gpu. For example: cuda (NVIDIA GPUs), + # rocm (Radeon GPUS), OpenCL (opencl). + tgt_gpu = tvm.target.Target(target="cuda", host="llvm") + + # Recreate the schedule + n = te.var("n") + A = te.placeholder((n,), name="A") + B = te.placeholder((n,), name="B") + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + print(type(C)) + + s = te.create_schedule(C.op) + + bx, tx = s[C].split(C.op.axis[0], factor=64) + + ################################################################################ + # Finally we must bind the iteration axis bx and tx to threads in the GPU + # compute grid. 
The naive schedule is not valid for GPUs, and these are + # specific constructs that allow us to generate code that runs on a GPU. + + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + + ###################################################################### + # Compilation + # ----------- + # After we have finished specifying the schedule, we can compile it + # into a TVM function. By default TVM compiles into a type-erased + # function that can be directly called from the python side. + # + # In the following line, we use tvm.build to create a function. + # The build function takes the schedule, the desired signature of the + # function (including the inputs and outputs) as well as target language + # we want to compile to. + # + # The result of compilation fadd is a GPU device function (if GPU is + # involved) as well as a host wrapper that calls into the GPU + # function. fadd is the generated host wrapper function, it contains + # a reference to the generated device function internally. + + fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd") + + ################################################################################ + # The compiled TVM function is exposes a concise C API that can be invoked from + # any language. + # + # We provide a minimal array API in python to aid quick testing and prototyping. + # The array API is based on the `DLPack `_ standard. + # + # - We first create a GPU device. + # - Then tvm.nd.array copies the data to the GPU. + # - ``fadd`` runs the actual computation + # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness). + # + # Note that copying the data to and from the memory on the GPU is a required step. + + dev = tvm.device(tgt_gpu.kind.name, 0) + + n = 1024 + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + fadd(a, b, c) + tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + + ################################################################################ + # Inspect the Generated GPU Code + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # You can inspect the generated code in TVM. The result of tvm.build is a TVM + # Module. fadd is the host module that contains the host wrapper, it also + # contains a device module for the CUDA (GPU) function. + # + # The following code fetches the device module and prints the content code. + + if ( + tgt_gpu.kind.name == "cuda" + or tgt_gpu.kind.name == "rocm" + or tgt_gpu.kind.name.startswith("opencl") + ): + dev_module = fadd.imported_modules[0] + print("-----GPU code-----") + print(dev_module.get_source()) + else: + print(fadd.get_source()) + +################################################################################ +# Saving and Loading Compiled Modules +# ----------------------------------- +# Besides runtime compilation, we can save the compiled modules into a file and +# load them back later. +# +# The following code first performs the following steps: +# +# - It saves the compiled host module into an object file. +# - Then it saves the device module into a ptx file. 
+# - cc.create_shared calls a compiler (gcc) to create a shared library + +from tvm.contrib import cc +from tvm.contrib import utils + +temp = utils.tempdir() +fadd.save(temp.relpath("myadd.o")) +if tgt.kind.name == "cuda": + fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) +if tgt.kind.name == "rocm": + fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) +if tgt.kind.name.startswith("opencl"): + fadd.imported_modules[0].save(temp.relpath("myadd.cl")) +cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) +print(temp.listdir()) + +################################################################################ +# .. note:: Module Storage Format +# +# The CPU (host) module is directly saved as a shared library (.so). There +# can be multiple customized formats of the device code. In our example, the +# device code is stored in ptx, as well as a meta data json file. They can be +# loaded and linked separately via import. + +################################################################################ +# Load Compiled Module +# ~~~~~~~~~~~~~~~~~~~~ +# We can load the compiled module from the file system and run the code. The +# following code loads the host and device module separately and links them +# together. We can verify that the newly loaded function works. + +fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) +if tgt.kind.name == "cuda": + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) + fadd1.import_module(fadd1_dev) + +if tgt.kind.name == "rocm": + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) + fadd1.import_module(fadd1_dev) + +if tgt.kind.name.startswith("opencl"): + fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) + fadd1.import_module(fadd1_dev) + +fadd1(a, b, c) +tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + +################################################################################ +# Pack Everything into One Library +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In the above example, we store the device and host code separately. TVM also +# supports export everything as one shared library. Under the hood, we pack +# the device modules into binary blobs and link them together with the host +# code. Currently we support packing of Metal, OpenCL and CUDA modules. + +fadd.export_library(temp.relpath("myadd_pack.so")) +fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) +fadd2(a, b, c) +tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + +################################################################################ +# .. note:: Runtime API and Thread-Safety +# +# The compiled modules of TVM do not depend on the TVM compiler. Instead, +# they only depend on a minimum runtime library. The TVM runtime library +# wraps the device drivers and provides thread-safe and device agnostic calls +# into the compiled functions. +# +# This means that you can call the compiled TVM functions from any thread, on +# any GPUs, provided that you have compiled the code for that GPU. + +################################################################################ +# Generate OpenCL Code +# -------------------- +# TVM provides code generation features into multiple backends. We can also +# generate OpenCL code or LLVM code that runs on CPU backends. +# +# The following code blocks generate OpenCL code, creates array on an OpenCL +# device, and verifies the correctness of the code. 
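# Note that the block below only executes when ``tgt`` is an OpenCL target,
# while this tutorial defaults to llvm. As a small optional check added here
# (assuming ``tvm.runtime.enabled`` is available in your TVM version), you can
# ask the runtime whether your build was compiled with OpenCL support before
# switching targets:
print("OpenCL runtime enabled:", tvm.runtime.enabled("opencl"))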
+
if tgt.kind.name.startswith("opencl"):
    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
    print("------opencl code------")
    print(fadd_cl.imported_modules[0].get_source())
    dev = tvm.cl(0)
    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd_cl(a, b, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# .. note:: TE Scheduling Primitives
#
#   TVM includes a number of different scheduling primitives:
#
#   - split: splits a specified axis into two axes by the defined factor.
#   - tile: splits a computation across two axes by the defined factors.
#   - fuse: fuses two consecutive axes of one computation.
#   - reorder: can reorder the axes of a computation into a defined order.
#   - bind: can bind a computation to a specific thread, useful in GPU programming.
#   - compute_at: by default, TVM will compute tensors at the outermost level
#     of the function, or the root. compute_at specifies that one tensor
#     should be computed at the first axis of computation for another
#     operator.
#   - compute_inline: when marked inline, a computation will be expanded then
#     inserted into the address where the tensor is required.
#   - compute_root: moves a computation to the outermost layer, or root, of the
#     function. This means that the stage of the computation will be fully
#     computed before it moves on to the next stage.
#
#   A complete description of these primitives can be found in the
#   `Schedule Primitives <https://tvm.apache.org/docs/tutorials/language/schedule_primitives.html>`_ docs page.

################################################################################
# Example 2: Manually Optimizing Matrix Multiplication with TE
# ------------------------------------------------------------
#
# Now we will consider a second, more advanced example, demonstrating how with
# just 18 lines of Python code TVM speeds up a common matrix multiplication operation by 18x.
#
# **Matrix multiplication is a compute intensive operation. There are
# two important optimizations for good CPU performance:**
#
# 1. Increase the cache hit rate of memory access. Both complex
#    numerical computation and hot-spot memory access can be
#    accelerated by a high cache hit rate. This requires us to
#    transform the original memory access pattern to a pattern that fits
#    the cache policy.
#
# 2. SIMD (single instruction, multiple data), also known as the vector
#    processing unit. On each cycle, instead of processing a single
#    value, SIMD can process a small batch of data. This requires us
#    to transform the data access pattern in the loop body into a uniform
#    pattern so that the LLVM backend can lower it to SIMD.
#
# The techniques used in this tutorial are a subset of tricks mentioned in this
# `repository `_. Some of them
# have been applied by the TVM abstraction automatically, but some of them
# cannot be automatically applied due to TVM constraints.

################################################################################
# Preparation and Performance Baseline
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# We begin by collecting performance data on the `numpy` implementation of
# matrix multiplication.
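# One caveat for the baseline below: numpy's matmul performance depends heavily
# on the BLAS library it was built against (e.g. OpenBLAS or MKL). As an
# optional sketch added here, you can print that build configuration before
# comparing numbers:
import numpy

numpy.__config__.show()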
+ +import tvm +import tvm.testing +from tvm import te +import numpy + +# The size of the matrix +# (M, K) x (K, N) +# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL. +M = 1024 +K = 1024 +N = 1024 + +# The default tensor data type in tvm +dtype = "float32" + +# You will want to adjust the target to match any CPU vector extensions you +# might have. For example, if you're using using Intel AVX2 (Advanced Vector +# Extensions) ISA for SIMD, you can get the best performance by changing the +# following line to ``llvm -mcpu=core-avx2``, or specific type of CPU you use. +# Recall that you're using llvm, you can get this information from the command +# ``llc --version`` to get the CPU type, and you can check ``/proc/cpuinfo`` +# for additional extensions that your processor might support. + +target = tvm.target.Target(target="llvm", host="llvm") +dev = tvm.device(target.kind.name, 0) + +# Random generated tensor for testing +a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev) +b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev) + +# Repeatedly perform a matrix multiplication to get a performance baseline +# for the default numpy implementation +np_repeat = 100 +np_running_time = timeit.timeit( + setup="import numpy\n" + "M = " + str(M) + "\n" + "K = " + str(K) + "\n" + "N = " + str(N) + "\n" + 'dtype = "float32"\n' + "a = numpy.random.rand(M, K).astype(dtype)\n" + "b = numpy.random.rand(K, N).astype(dtype)\n", + stmt="answer = numpy.dot(a, b)", + number=np_repeat, +) +print("Numpy running time: %f" % (np_running_time / np_repeat)) + +answer = numpy.dot(a.numpy(), b.numpy()) + +################################################################################ +# Now we write a basic matrix multiplication using TVM TE and verify that it +# produces the same results as the numpy implementation. We also write a +# function that will help us measure the performance of the schedule +# optimizations. + +# TVM Matrix Multiplication using TE +k = te.reduce_axis((0, K), "k") +A = te.placeholder((M, K), name="A") +B = te.placeholder((K, N), name="B") +C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C") + +# Default schedule +s = te.create_schedule(C.op) +func = tvm.build(s, [A, B, C], target=target, name="mmult") + +c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) +func(a, b, c) +tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) + + +def evaluate_operation(s, vars, target, name, optimization, log): + func = tvm.build(s, [A, B, C], target=target, name="mmult") + assert func + + c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) + func(a, b, c) + tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) + + evaluator = func.time_evaluator(func.entry_name, dev, number=10) + mean_time = evaluator(a, b, c).mean + print("%s: %f" % (optimization, mean_time)) + log.append((optimization, mean_time)) + + +log = [] + +evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="none", log=log) + +################################################################################ +# Let's take a look at the intermediate representation of the operator and +# default schedule using the TVM lower function. Note how the implementation is +# essentially a naive implementation of a matrix multiplication, using three +# nested loops over the indices of the A and B matrices. 
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 1: Blocking
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# An important trick to enhance the cache hit rate is blocking, where you
+# structure memory accesses so that the data used inside a block forms a small
+# neighborhood with high memory locality. In this tutorial, we pick a block
+# factor of 32. This will result in a block that fills a 32 * 32 * sizeof(float)
+# area of memory, or 4 KB, relative to a reference L1 cache size of 32 KB.
+#
+# We begin by creating a default schedule for the ``C`` operation, then apply a
+# ``tile`` scheduling primitive to it with the specified block factor, with the
+# scheduling primitive returning the resulting loop order from outermost to
+# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. We then get
+# the reduction axis for the output of the operation, and perform a split
+# operation on it using a factor of 4. This factor doesn't directly impact the
+# blocking optimization we're working on right now, but will be useful later
+# when we apply vectorization.
+#
+# Now that the operation has been blocked, we can reorder the computation to
+# put the reduction operation into the outermost loop of the computation,
+# helping to guarantee that the blocked data remains in cache. This completes
+# the schedule, and we can build and test the performance compared to the naive
+# schedule.
+
+bn = 32
+
+# Blocking by loop tiling
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+# Hoist reduction domain outside the blocking loop
+s[C].reorder(xo, yo, ko, ki, xi, yi)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log)
+
+################################################################################
+# By reordering the computation to take advantage of caching, you should see a
+# significant improvement in the performance of the computation. Now, print the
+# internal representation and compare it to the original:
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 2: Vectorization
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Another important optimization trick is vectorization. When the memory access
+# pattern is uniform, the compiler can detect this pattern and pass the
+# contiguous memory to the SIMD vector processor. In TVM, we can use the
+# ``vectorize`` interface to hint to the compiler about this pattern, taking
+# advantage of this hardware feature.
+#
+# In this tutorial, we choose to vectorize the inner loop of row data since it
+# is already cache friendly from our previous optimizations.
+
+# Apply the vectorization optimization
+s[C].vectorize(yi)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="vectorization", log=log)
+
+# The generalized IR after vectorization
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 3: Loop Permutation
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# If we look at the above IR, we can see the inner loop row data is vectorized
+# and B is transformed into PackedB (this is evident by the `(float32x32*)B2`
+# portion of the inner loop). The traversal of PackedB is sequential now.
+# So we will look at the access pattern of A. In the current schedule, A is
+# accessed column by column, which is not cache friendly. If we change the
+# nested loop order of `ki` and inner axes `xi`, the access pattern for the A
+# matrix will be more cache friendly.
+
+s = te.create_schedule(C.op)
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+# re-ordering
+s[C].reorder(xo, yo, ko, xi, ki, yi)
+s[C].vectorize(yi)
+
+evaluate_operation(
+ s, [A, B, C], target=target, name="mmult", optimization="loop permutation", log=log
+)
+
+# Again, print the new generalized IR
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 4: Array Packing
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Another important trick is array packing. This trick reorders the storage
+# dimensions of the array to convert the continuous access pattern on a certain
+# dimension into a sequential pattern after flattening.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
+# :align: center
+#
+# As shown in the figure above, after blocking the computations, we can observe
+# the array access pattern of B (after flattening), which is regular but
+# discontinuous. We expect that after some transformation we can get a
+# continuous access pattern. By reordering a ``[16][16]`` array to a
+# ``[16/4][16][4]`` array, the access pattern of B will be sequential when
+# grabbing the corresponding value from the packed array.
+#
+# To accomplish this, we are going to have to start with a new default
+# schedule, taking into account the new packing of B. It's worth taking a
+# moment to comment on this: TE is a powerful and expressive language for
+# writing optimized operators, but it often requires some knowledge of the
+# underlying algorithm, data structures, and hardware target that you are
+# writing for. Later in the tutorial, we will discuss some of the options for
+# letting TVM take that burden. Regardless, let's move on with the new
+# optimized schedule.
+
+# We have to re-write the algorithm slightly.
+packedB = te.compute((N // bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
+C = te.compute(
+ (M, N),
+ lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
+ name="C",
+)
+
+s = te.create_schedule(C.op)
+
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+(k,) = s[C].op.reduce_axis
+ko, ki = s[C].split(k, factor=4)
+
+s[C].reorder(xo, yo, ko, xi, ki, yi)
+s[C].vectorize(yi)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="array packing", log=log)
+
+# Here is the generated IR after array packing.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 5: Optimizing Block Writing Through Caching
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Up to this point all of our optimizations have focused on efficiently
+# accessing and computing the data from the `A` and `B` matrices to compute the
+# `C` matrix. After the blocking optimization, the operator will write the
+# result to `C` block by block, and the access pattern is not sequential.
+# We can address this by using a sequential cache array, combining
+# `cache_write`, `compute_at`, and `unroll` to hold the block results and write
+# to `C` when all the block results are ready.
+
+s = te.create_schedule(C.op)
+
+# Allocate write cache
+CC = s.cache_write(C, "global")
+
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+
+# Write cache is computed at yo
+s[CC].compute_at(s[C], yo)
+
+# New inner axes
+xc, yc = s[CC].op.axis
+
+(k,) = s[CC].op.reduce_axis
+ko, ki = s[CC].split(k, factor=4)
+s[CC].reorder(ko, xc, ki, yc)
+s[CC].unroll(ki)
+s[CC].vectorize(yc)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="block caching", log=log)
+
+# Here is the generated IR after write cache blocking.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Optimization 6: Parallelization
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# So far, our computation is only designed to use a single core. Nearly all
+# modern processors have multiple cores, and the computation can benefit from
+# running in parallel. The final optimization is to take advantage of
+# thread-level parallelization.
+
+# parallel
+s[C].parallel(xo)
+
+x, y, z = s[packedB].op.axis
+s[packedB].vectorize(z)
+s[packedB].parallel(x)
+
+evaluate_operation(
+ s, [A, B, C], target=target, name="mmult", optimization="parallelization", log=log
+)
+
+# Here is the generated IR after parallelization.
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+################################################################################
+# Summary of Matrix Multiplication Example
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# After applying the above simple optimizations with only 18 lines of code, our
+# generated code can begin to approach the performance of `numpy` with the Math
+# Kernel Library (MKL). Since we've been logging the performance as we've been
+# working, we can compare the results.
+
+baseline = log[0][1]
+print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20)))
+for result in log:
+ print(
+ "%s\t%s\t%s"
+ % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20))
+ )
+
+################################################################################
+# Note that the outputs on the web page reflect the running times on a
+# non-exclusive Docker container, and should be considered unreliable. You are
+# highly encouraged to run the tutorial yourself to observe the performance
+# gain achieved by TVM, and to carefully work through each example to
+# understand the iterative improvements that are made to the matrix
+# multiplication operation.
+
+################################################################################
+# Final Notes and Summary
+# -----------------------
+# As mentioned earlier, applying optimizations using TE and scheduling
+# primitives can require some knowledge of the underlying architecture and
+# algorithms. However, TE was designed to act as a foundation for more complex
+# algorithms that can search for potential optimizations. With the knowledge you
+# have from this introduction to TE, we can now begin to explore how TVM can
+# automate the schedule optimization process.
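+
+################################################################################
+# Before summarizing, here is a compact recap sketch of the TE workflow shown
+# in this tutorial, using the vector addition example from the beginning. This
+# is an illustrative sketch only: the variable names are chosen to avoid
+# clashing with the matrix multiplication example above, and the library file
+# name is just an example. It reuses the ``tvm``, ``te``, and ``tvm.testing``
+# imports already present in this file.
+
+import numpy as np
+
+# 1. Describe the computation as a series of operations.
+vec_n = 1024
+VA = te.placeholder((vec_n,), name="VA")
+VB = te.placeholder((vec_n,), name="VB")
+VC = te.compute((vec_n,), lambda i: VA[i] + VB[i], name="VC")
+
+# 2. Describe how to compute it by creating (and optionally transforming) a schedule.
+vec_s = te.create_schedule(VC.op)
+
+# 3. Compile to a function for the desired target.
+vec_fadd = tvm.build(vec_s, [VA, VB, VC], target="llvm", name="recap_vector_add")
+
+# 4. Run the compiled function, and optionally save it so it can be loaded later.
+vec_dev = tvm.device("llvm", 0)
+va = tvm.nd.array(np.random.uniform(size=vec_n).astype("float32"), vec_dev)
+vb = tvm.nd.array(np.random.uniform(size=vec_n).astype("float32"), vec_dev)
+vc = tvm.nd.array(np.zeros(vec_n, dtype="float32"), vec_dev)
+vec_fadd(va, vb, vc)
+tvm.testing.assert_allclose(vc.numpy(), va.numpy() + vb.numpy())
+vec_fadd.export_library("recap_vector_add.so")  # example file name only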
+
+#
+# This tutorial provided a walkthrough of the TVM Tensor Expression (TE)
+# workflow using vector add and matrix multiplication examples. The general
+# workflow is:
+#
+# - Describe your computation via a series of operations.
+# - Describe how we want the computation to be performed using schedule primitives.
+# - Compile to the target function we want.
+# - Optionally, save the function to be loaded later.
+#
+# Upcoming tutorials expand on the matrix multiplication example, and show how
+# you can build generic templates of the matrix multiplication and other
+# operations with tunable parameters that allow you to automatically optimize
+# the computation for specific platforms.
diff --git a/gallery/user_tutorials/tvmc_command_line_driver.py b/gallery/user_tutorials/tvmc_command_line_driver.py
new file mode 100644
index 000000000000..c729b86a3245
--- /dev/null
+++ b/gallery/user_tutorials/tvmc_command_line_driver.py
@@ -0,0 +1,511 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Compiling and Optimizing a Model with TVMC
+==========================================
+**Authors**:
+`Leandro Nunes `_,
+`Matthew Barrett `_,
+`Chris Hoge `_
+
+In this section, we will work with TVMC, the TVM command line driver. TVMC is a
+tool that exposes TVM features such as auto-tuning, compiling, profiling and
+execution of models through a command line interface.
+
+Upon completion of this section, we will have used TVMC to accomplish the
+following tasks:
+
+* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
+* Run a real image through the compiled model, and interpret the output and
+ model performance.
+* Tune the model on a CPU using TVM.
+* Re-compile an optimized model using the tuning data collected by TVM.
+* Run the image through the optimized model, and compare the output and model
+ performance.
+
+The goal of this section is to give you an overview of TVM and TVMC's
+capabilities, and set the stage for understanding how TVM works.
+"""
+
+################################################################################
+# Using TVMC
+# ----------
+#
+# TVMC is a Python application, part of the TVM Python package.
+# When you install TVM using a Python package, you will get TVMC as
+# a command line application called ``tvmc``. The location of this command
+# will vary depending on your platform and installation method.
+#
+# Alternatively, if you have TVM as a Python module on your
+# ``$PYTHONPATH``, you can access the command line driver functionality
+# via the executable Python module, ``python -m tvm.driver.tvmc``.
+#
+# For simplicity, this tutorial will mention the TVMC command line using
+# ``tvmc ``, but the same results can be obtained with
+# ``python -m tvm.driver.tvmc ``.
+#
+# You can check the help page using:
+#
+# .. code-block:: bash
+#
+# tvmc --help
+#
+# The main features of TVM available through ``tvmc`` are the ``compile``,
+# ``run``, and ``tune`` subcommands. To read about specific options under
+# a given subcommand, use ``tvmc --help``. We will cover each of
+# these commands in this tutorial, but first we need to download a pre-trained
+# model to work with.
+#
+
+
+################################################################################
+# Obtaining the Model
+# -------------------
+#
+# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a
+# convolutional neural network that is 50 layers deep and designed to classify
+# images. The model we will be using has been pre-trained on more than a
+# million images with 1000 different classifications. The network has an input
+# image size of 224x224. If you are interested in exploring more of how the
+# ResNet-50 model is structured, we recommend downloading `Netron
+# `, a freely available ML model viewer.
+#
+# For this tutorial we will be using the model in ONNX format.
+#
+# .. code-block:: bash
+#
+# wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx
+#
+
+
+################################################################################
+# .. note:: Supported model formats
+#
+# TVMC supports models created with Keras, ONNX, TensorFlow, TFLite
+# and Torch. Use the option ``--model-format`` if you need to
+# explicitly provide the model format you are using. See ``tvmc
+# compile --help`` for more information.
+#
+
+################################################################################
+# .. note:: Adding ONNX Support to TVM
+#
+# TVM relies on the ONNX Python library being available on your system. You
+# can install ONNX using the command ``pip3 install --user onnx``. You may
+# remove the ``--user`` option if you have root access and want to install
+# ONNX globally.
+#
+
+################################################################################
+# Compiling an ONNX Model to the TVM Runtime
+# ------------------------------------------
+#
+# Once we've downloaded the ResNet-50 model, the next step is to compile it. To
+# accomplish that, we are going to use ``tvmc compile``. The output we get from
+# the compilation process is a TAR package of the model compiled to a dynamic
+# library for our target platform. We can run that model on our target device
+# using the TVM runtime.
+#
+# .. code-block:: bash
+#
+# tvmc compile \
+# --target "llvm" \
+# --output resnet50-v2-7-tvm.tar \
+# resnet50-v2-7.onnx
+#
+# Let's take a look at the files that ``tvmc compile`` creates in the module:
+#
+# .. code-block:: bash
+#
+# mkdir model
+# tar -xvf resnet50-v2-7-tvm.tar -C model
+# ls model
+#
+# You will see three files listed.
+#
+# * ``mod.so`` is the model, represented as a C++ library, that can be loaded
+# by the TVM runtime.
+# * ``mod.json`` is a text representation of the TVM Relay computation graph.
+# * ``mod.params`` is a file containing the parameters for the pre-trained
+# model.
+#
+# This module can be directly loaded by your application, and the model can be
+# run via the TVM runtime APIs.
+
+
+################################################################################
+# .. note:: Defining the Correct Target
+#
+# Specifying the correct target (option ``--target``) can have a huge
+# impact on the performance of the compiled module, as it can take
+# advantage of hardware features available on the target.
For more +# information, please refer to `Auto-tuning a convolutional network +# for x86 CPU `_. +# We recommend identifying which CPU you are running, along with optional features, +# and set the target appropriately. +# + +################################################################################ +# Running the Model from The Compiled Module with TVMC +# ---------------------------------------------------- +# +# Now that we've compiled the model to this module, we can use the TVM runtime +# to make predictions with it. TVMC has the TVM runtime built in to it, +# allowing you to run compiled TVM models. To use TVMC to run the model and +# make predictions, we need two things: +# +# - The compiled module, which we just produced. +# - Valid input to the model to make predictions on. +# +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This +# is a well-supported NumPy format to serialize multiple arrays into a file +# +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. +# +# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg +# :height: 224px +# :width: 224px +# :align: center + + +################################################################################ +# Input pre-processing +# ~~~~~~~~~~~~~~~~~~~~ +# +# For our ResNet 50 V2 model, the input is expected to be in ImageNet format. +# Here is an example of a script to pre-process an image for ResNet 50 V2. +# +# You will need to have a supported version of the Python Image Library +# installed. You can use ``pip3 install --user pillow`` to satisfy this +# requirement for the script. +# +# .. code-block:: python +# :caption: preprocess.py +# :name: preprocess.py +# +# #!python ./preprocess.py +# from tvm.contrib.download import download_testdata +# from PIL import Image +# import numpy as np +# +# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") +# +# # Resize it to 224x224 +# resized_image = Image.open(img_path).resize((224, 224)) +# img_data = np.asarray(resized_image).astype("float32") +# +# # ONNX expects NCHW input, so convert the array +# img_data = np.transpose(img_data, (2, 0, 1)) +# +# # Normalize according to ImageNet +# imagenet_mean = np.array([0.485, 0.456, 0.406]) +# imagenet_stddev = np.array([0.229, 0.224, 0.225]) +# norm_img_data = np.zeros(img_data.shape).astype("float32") +# for i in range(img_data.shape[0]): +# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] +# +# # Add batch dimension +# img_data = np.expand_dims(norm_img_data, axis=0) +# +# # Save to .npz (outputs imagenet_cat.npz) +# np.savez("imagenet_cat", data=img_data) +# + +################################################################################ +# Running the Compiled Module +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# With both the model and input data in hand, we can now run TVMC to make a +# prediction: +# +# .. code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm.tar +# +# Recall that the `.tar` model file includes a C++ library, a description of +# the Relay model, and the parameters for the model. 
TVMC includes the TVM +# runtime, which can load the model and make predictions against input. When +# running the above command, TVMC outputs a new file, ``predictions.npz``, that +# contains the model output tensors in NumPy format. +# +# In this example, we are running the model on the same machine that we used +# for compilation. In some cases we might want to run it remotely via an RPC +# Tracker. To read more about these options please check ``tvmc run --help``. + +################################################################################ +# Output Post-Processing +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# As previously mentioned, each model will have its own particular way of +# providing output tensors. +# +# In our case, we need to run some post-processing to render the outputs from +# ResNet 50 V2 into a more human-readable form, using the lookup-table provided +# for the model. +# +# The script below shows an example of the post-processing to extract labels +# from the output of our compiled module. +# +# .. code-block:: python +# :caption: postprocess.py +# :name: postprocess.py +# +# #!python ./postprocess.py +# import os.path +# import numpy as np +# +# from scipy.special import softmax +# +# from tvm.contrib.download import download_testdata +# +# # Download a list of labels +# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +# labels_path = download_testdata(labels_url, "synset.txt", module="data") +# +# with open(labels_path, "r") as f: +# labels = [l.rstrip() for l in f] +# +# output_file = "predictions.npz" +# +# # Open the output and read the output tensor +# if os.path.exists(output_file): +# with np.load(output_file) as data: +# scores = softmax(data["output_0"]) +# scores = np.squeeze(scores) +# ranks = np.argsort(scores)[::-1] +# +# for rank in ranks[0:5]: +# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) +# +# Running this script should produce the following output: +# +# .. code-block:: bash +# +# python postprocess.py +# +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 +# +# Try replacing the cat image with other images, and see what sort of +# predictions the ResNet model makes. + +################################################################################ +# Automatically Tuning the ResNet Model +# ------------------------------------- +# +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVMC to target your working platform. +# +# In some cases, we might not get the expected performance when running +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner, to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. 
+
+# The results of these runs are stored in a tuning records file, which is
+# ultimately the output of the ``tune`` subcommand.
+#
+# In the simplest form, tuning requires you to provide three things:
+#
+# - the target specification of the device you intend to run this model on
+# - the path to an output file in which the tuning records will be stored, and
+# finally
+# - a path to the model to be tuned.
+#
+# The example below demonstrates how that works in practice:
+#
+# .. code-block:: bash
+#
+# tvmc tune \
+# --target "llvm" \
+# --output resnet50-v2-7-autotuner_records.json \
+# resnet50-v2-7.onnx
+#
+# In this example, you will see better results if you indicate a more specific
+# target for the `--target` flag. For example, on an Intel i7 processor you
+# could use `--target llvm -mcpu=skylake`. For this tuning example, we are
+# tuning locally on the CPU using LLVM as the compiler for the specified
+# architecture.
+#
+# TVMC will perform a search against the parameter space for the model, trying
+# out different configurations for operators and choosing the one that runs
+# fastest on your platform. Although this is a guided search based on the CPU
+# and model operations, it can still take several hours to complete the search.
+# The output of this search will be saved to the
+# `resnet50-v2-7-autotuner_records.json` file, which will later be used to
+# compile an optimized model.
+#
+# .. note:: Defining the Tuning Search Algorithm
+#
+# By default this search is guided using an `XGBoost Grid` algorithm.
+# Depending on your model complexity and the amount of time available, you
+# might want to choose a different algorithm. A full list is available by
+# consulting ``tvmc tune --help``.
+#
+# The output will look something like this for a consumer-level Skylake CPU:
+#
+# .. code-block:: bash
+#
+# tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx
+# # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
+# # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
+# # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
+# # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
+# # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
+# # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
+# # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
+# # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
+# # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
+# # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
+# # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
+# # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
+# # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
+# # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
+# # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
+# # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
+# # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done.
+# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. +# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. +# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. +# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. +# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. +# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. +# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. +# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. +# +# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning +# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning +# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information. +# + +################################################################################ +# Compiling an Optimized Model with Tuning Data +# ---------------------------------------------- +# +# As an output of the tuning process above, we obtained the tuning records +# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in +# two ways: +# +# - As input to further tuning (via ``tvmc tune --tuning-records``). +# - As input to the compiler +# +# The compiler will use the results to generate high performance code for the +# model on your specified target. To do that we can use ``tvmc compile +# --tuning-records``. Check ``tvmc compile --help`` for more information. +# +# Now that tuning data for the model has been collected, we can re-compile the +# model using optimized operators to speed up our computations. +# +# .. code-block:: bash +# +# tvmc compile \ +# --target "llvm" \ +# --tuning-records resnet50-v2-7-autotuner_records.json \ +# --output resnet50-v2-7-tvm_autotuned.tar \ +# resnet50-v2-7.onnx +# +# Verify that the optimized model runs and produces the same results: +# +# .. code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm_autotuned.tar +# +# python postprocess.py +# +# Verifying that the predictions are the same: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610550 +# # class='n02123159 tiger cat' with probability=0.367181 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Comparing the Tuned and Untuned Models +# -------------------------------------- +# +# TVMC gives you tools for basic performance benchmarking between the models. +# You can specify a number of repetitions and that TVMC report on the model run +# time (independent of runtime startup). We can get a rough idea of how much +# tuning has improved the model performance. For example, on a test Intel i7 +# system, we see that the tuned model runs 47% faster than the untuned model: +# +# .. 
code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm_autotuned.tar +# +# # Execution time summary: +# # mean (ms) max (ms) min (ms) std (ms) +# # 92.19 115.73 89.85 3.15 +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm.tar +# +# # Execution time summary: +# # mean (ms) max (ms) min (ms) std (ms) +# # 193.32 219.97 185.04 7.11 +# + + +################################################################################ +# Final Remarks +# ------------- +# +# In this tutorial, we presented TVMC, a command line driver for TVM. We +# demonstrated how to compile, run, and tune a model. We also discussed the +# need for pre and post-processing of inputs and outputs. After the tuning +# process, we demonstrated how to compare the performance of the unoptimized +# and optimize models. +# +# Here we presented a simple example using ResNet 50 V2 locally. However, TVMC +# supports many more features including cross-compilation, remote execution and +# profiling/benchmarking. +# +# To see what other options are available, please have a look at ``tvmc +# --help``. +# +# In the next tutorial, `Compiling and Optimizing a Model with the Python +# Interface `_, we will cover the same compilation +# and optimization steps using the Python interface. From 7c6d038139fecb90939b781d286359369cff4115 Mon Sep 17 00:00:00 2001 From: Chris Hoge Date: Fri, 13 Aug 2021 03:56:12 +0000 Subject: [PATCH 2/2] More documentation refactoring --- docs/conf.py | 75 +- docs/dev_how_tos/index.rst | 117 ++- .../code_guide.rst | 0 .../code_review.rst | 0 .../committer_guide.rst | 0 .../community.rst | 0 .../{contribute => dev_tutorial}/document.rst | 0 .../error_handling.rst | 0 .../git_howto.rst | 0 docs/{contribute => dev_tutorial}/index.rst | 4 +- .../pull_request.rst | 0 .../release_process.rst | 0 docs/dev_tutorials/index.rst | 24 - docs/index.rst | 8 +- docs/user_how_tos/index.rst | 14 +- docs/{ => user_how_tos}/install/docker.rst | 0 .../install/from_source.rst | 0 docs/{ => user_how_tos}/install/index.rst | 2 +- docs/{ => user_how_tos}/install/nnpack.rst | 0 docs/user_reference/index.rst | 6 +- docs/user_tutorial/index.rst | 223 +++++ docs/user_tutorials/index.rst | 25 - gallery/dev_how_tos/README.txt | 5 + .../dev_how_tos}/bring_your_own_datatypes.py | 0 .../dev_how_tos}/low_level_custom_pass.py | 0 .../dev_how_tos}/use_pass_infra.py | 0 .../dev_how_tos}/use_pass_instrument.py | 0 .../user_how_tos/compile_models/README.txt | 4 + .../compile_models}/from_caffe2.py | 0 .../compile_models}/from_coreml.py | 0 .../compile_models}/from_darknet.py | 0 .../compile_models}/from_keras.py | 0 .../compile_models}/from_mxnet.py | 0 .../user_how_tos/compile_models}/from_onnx.py | 0 .../compile_models}/from_pytorch.py | 0 .../compile_models}/from_tensorflow.py | 0 .../compile_models}/from_tflite.py | 0 gallery/user_how_tos/deploy_models/README.txt | 4 + .../deploy_models}/deploy_model_on_android.py | 0 .../deploy_models}/deploy_model_on_rasp.py | 0 .../deploy_object_detection_pytorch.py | 0 .../deploy_models}/deploy_prequantized.py | 0 .../deploy_prequantized_tflite.py | 0 .../deploy_models}/deploy_quantized.py | 0 .../deploy_models}/deploy_sparse.py | 0 .../deploy_models}/deploy_ssd_gluoncv.py | 0 .../optimize_tensor_operators}/README.txt | 2 + .../opt_conv_cuda.py | 0 .../opt_conv_tensorcore.py | 0 .../optimize_tensor_operators}/opt_gemm.py | 0 
.../tune_with_autoscheduler/README.txt | 4 + .../ci_logs/conv2d.json | 0 .../ci_logs/matmul.json | 0 .../ci_logs/resnet-18-NHWC-B1-cuda.json | 0 .../ci_logs/resnet-50-NHWC-B1-llvm.json | 0 .../ci_logs/sparse_dense.json | 0 .../tune_conv2d_layer_cuda.py | 0 .../tune_network_arm.py | 0 .../tune_network_cuda.py | 0 .../tune_network_mali.py | 0 .../tune_network_x86.py | 0 .../tune_sparse_x86.py | 0 .../tune_with_templates_autotvm/README.txt | 4 + .../tune_conv2d_cuda.py | 0 .../tune_relay_arm.py | 0 .../tune_relay_cuda.py | 0 .../tune_relay_mobile_gpu.py | 0 .../tune_relay_x86.py | 0 .../work_with_microtvm/README.txt | 3 + .../work_with_microtvm}/micro_reference_vm.py | 0 .../work_with_microtvm}/micro_tflite.py | 0 .../user_how_tos/work_with_relay/README.txt | 4 + .../work_with_relay}/build_gcn.py | 0 .../work_with_relay}/using_external_lib.py | 0 .../work_with_te_schedules/README.txt | 4 + .../work_with_te_schedules}/extern_op.py | 0 .../work_with_te_schedules}/intrin_math.py | 0 .../work_with_te_schedules}/reduction.py | 0 .../work_with_te_schedules}/scan.py | 0 .../schedule_primitives.py | 0 .../work_with_te_schedules}/tedd.py | 0 .../work_with_te_schedules}/tensorize.py | 0 .../work_with_te_schedules}/tuple_inputs.py | 0 .../README.txt | 4 +- .../auto_scheduler_matmul_x86.py | 0 .../autotvm_matmul_x86.py | 0 .../autotvm_relay_x86.py | 0 .../cross_compilation_and_rpc.py | 0 .../install.py | 0 .../user_tutorial}/intro_topi.py | 0 .../introduction.py | 0 .../relay_quick_start.py | 0 .../tensor_expr_get_started.py | 0 .../tvmc_command_line_driver.py | 0 tutorials/auto_scheduler/README.txt | 2 - tutorials/autotvm/README.txt | 4 - tutorials/dev/README.txt | 3 - tutorials/frontend/README.txt | 4 - tutorials/get_started/README.txt | 2 - .../get_started/auto_scheduler_matmul_x86.py | 214 ----- tutorials/get_started/autotvm_matmul_x86.py | 377 -------- tutorials/get_started/autotvm_relay_x86.py | 476 --------- .../get_started/cross_compilation_and_rpc.py | 265 ----- tutorials/get_started/install.py | 50 - tutorials/get_started/introduction.py | 134 --- tutorials/get_started/relay_quick_start.py | 155 --- .../get_started/tensor_expr_get_started.py | 903 ------------------ .../get_started/tvmc_command_line_driver.py | 511 ---------- tutorials/language/README.txt | 2 - tutorials/micro/README.txt | 4 - tutorials/topi/README.txt | 2 - 111 files changed, 433 insertions(+), 3211 deletions(-) rename docs/{contribute => dev_tutorial}/code_guide.rst (100%) rename docs/{contribute => dev_tutorial}/code_review.rst (100%) rename docs/{contribute => dev_tutorial}/committer_guide.rst (100%) rename docs/{contribute => dev_tutorial}/community.rst (100%) rename docs/{contribute => dev_tutorial}/document.rst (100%) rename docs/{contribute => dev_tutorial}/error_handling.rst (100%) rename docs/{contribute => dev_tutorial}/git_howto.rst (100%) rename docs/{contribute => dev_tutorial}/index.rst (96%) rename docs/{contribute => dev_tutorial}/pull_request.rst (100%) rename docs/{contribute => dev_tutorial}/release_process.rst (100%) delete mode 100644 docs/dev_tutorials/index.rst rename docs/{ => user_how_tos}/install/docker.rst (100%) rename docs/{ => user_how_tos}/install/from_source.rst (100%) rename docs/{ => user_how_tos}/install/index.rst (98%) rename docs/{ => user_how_tos}/install/nnpack.rst (100%) create mode 100644 docs/user_tutorial/index.rst delete mode 100644 docs/user_tutorials/index.rst create mode 100644 gallery/dev_how_tos/README.txt rename {tutorials/dev => gallery/dev_how_tos}/bring_your_own_datatypes.py 
(100%) rename {tutorials/dev => gallery/dev_how_tos}/low_level_custom_pass.py (100%) rename {tutorials/dev => gallery/dev_how_tos}/use_pass_infra.py (100%) rename {tutorials/dev => gallery/dev_how_tos}/use_pass_instrument.py (100%) create mode 100644 gallery/user_how_tos/compile_models/README.txt rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_caffe2.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_coreml.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_darknet.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_keras.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_mxnet.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_onnx.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_pytorch.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_tensorflow.py (100%) rename {tutorials/frontend => gallery/user_how_tos/compile_models}/from_tflite.py (100%) create mode 100644 gallery/user_how_tos/deploy_models/README.txt rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_model_on_android.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_model_on_rasp.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_object_detection_pytorch.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_prequantized.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_prequantized_tflite.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_quantized.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_sparse.py (100%) rename {tutorials/frontend => gallery/user_how_tos/deploy_models}/deploy_ssd_gluoncv.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/README.txt (53%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_conv_cuda.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_conv_tensorcore.py (100%) rename {tutorials/optimize => gallery/user_how_tos/optimize_tensor_operators}/opt_gemm.py (100%) create mode 100644 gallery/user_how_tos/tune_with_autoscheduler/README.txt rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/conv2d.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/matmul.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/resnet-18-NHWC-B1-cuda.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/resnet-50-NHWC-B1-llvm.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/ci_logs/sparse_dense.json (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_conv2d_layer_cuda.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_arm.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_cuda.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_mali.py (100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_network_x86.py 
(100%) rename {tutorials/auto_scheduler => gallery/user_how_tos/tune_with_autoscheduler}/tune_sparse_x86.py (100%) create mode 100644 gallery/user_how_tos/tune_with_templates_autotvm/README.txt rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_conv2d_cuda.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_arm.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_cuda.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_mobile_gpu.py (100%) rename {tutorials/autotvm => gallery/user_how_tos/tune_with_templates_autotvm}/tune_relay_x86.py (100%) create mode 100644 gallery/user_how_tos/work_with_microtvm/README.txt rename {tutorials/micro => gallery/user_how_tos/work_with_microtvm}/micro_reference_vm.py (100%) rename {tutorials/micro => gallery/user_how_tos/work_with_microtvm}/micro_tflite.py (100%) create mode 100644 gallery/user_how_tos/work_with_relay/README.txt rename {tutorials/frontend => gallery/user_how_tos/work_with_relay}/build_gcn.py (100%) rename {tutorials/frontend => gallery/user_how_tos/work_with_relay}/using_external_lib.py (100%) create mode 100644 gallery/user_how_tos/work_with_te_schedules/README.txt rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/extern_op.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/intrin_math.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/reduction.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/scan.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/schedule_primitives.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tedd.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tensorize.py (100%) rename {tutorials/language => gallery/user_how_tos/work_with_te_schedules}/tuple_inputs.py (100%) rename gallery/{user_tutorials => user_tutorial}/README.txt (70%) rename gallery/{user_tutorials => user_tutorial}/auto_scheduler_matmul_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/autotvm_matmul_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/autotvm_relay_x86.py (100%) rename gallery/{user_tutorials => user_tutorial}/cross_compilation_and_rpc.py (100%) rename gallery/{user_tutorials => user_tutorial}/install.py (100%) rename {tutorials/topi => gallery/user_tutorial}/intro_topi.py (100%) rename gallery/{user_tutorials => user_tutorial}/introduction.py (100%) rename gallery/{user_tutorials => user_tutorial}/relay_quick_start.py (100%) rename gallery/{user_tutorials => user_tutorial}/tensor_expr_get_started.py (100%) rename gallery/{user_tutorials => user_tutorial}/tvmc_command_line_driver.py (100%) delete mode 100644 tutorials/auto_scheduler/README.txt delete mode 100644 tutorials/autotvm/README.txt delete mode 100644 tutorials/dev/README.txt delete mode 100644 tutorials/frontend/README.txt delete mode 100644 tutorials/get_started/README.txt delete mode 100644 tutorials/get_started/auto_scheduler_matmul_x86.py delete mode 100644 tutorials/get_started/autotvm_matmul_x86.py delete mode 100644 tutorials/get_started/autotvm_relay_x86.py delete mode 100644 tutorials/get_started/cross_compilation_and_rpc.py delete mode 100644 tutorials/get_started/install.py delete mode 100644 tutorials/get_started/introduction.py delete mode 
100644 tutorials/get_started/relay_quick_start.py delete mode 100644 tutorials/get_started/tensor_expr_get_started.py delete mode 100644 tutorials/get_started/tvmc_command_line_driver.py delete mode 100644 tutorials/language/README.txt delete mode 100644 tutorials/micro/README.txt delete mode 100644 tutorials/topi/README.txt diff --git a/docs/conf.py b/docs/conf.py index 6962c448ed53..56506482bc55 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -197,12 +197,43 @@ def git_describe_version(original_version): from sphinx_gallery.sorting import ExplicitOrder -examples_dirs = ["../gallery/user_tutorials", "../tutorials/", "../vta/tutorials/"] -gallery_dirs = ["user_tutorials", "tutorials", "vta/tutorials"] +# This variable specifices where splinx gallery should look +# for gallery files. +examples_dirs = ["../gallery/user_tutorial", + "../gallery/user_how_tos/compile_models", + "../gallery/user_how_tos/deploy_models", + "../gallery/user_how_tos/work_with_relay", + "../gallery/user_how_tos/work_with_te_schedules", + "../gallery/user_how_tos/optimize_tensor_operators", + "../gallery/user_how_tos/tune_with_templates_autotvm", + "../gallery/user_how_tos/tune_with_autoscheduler", + "../gallery/user_how_tos/work_with_microtvm", + "../gallery/dev_how_tos", + "../tutorials/", + "../vta/tutorials/"] + +# This variable specifies where sphinx gallery should place +# output from gallery generation, relative to the top level +# docd directory. The output will be mapped as an ordered +# tuple to the example_dirs. For example, the output +# from ../gallery/user_tutorials will be rendered +# in tvm/docs/user_tutorials. +gallery_dirs = ["user_tutorial", + "user_how_tos/compile_models", + "user_how_tos/deploy_models", + "user_how_tos/work_with_relay", + "user_how_tos/work_with_te_schedules", + "user_how_tos/optimize_tensor_operators", + "user_how_tos/tune_with_templates_autotvm", + "user_how_tos/tune_with_autoscheduler", + "user_how_tos/work_with_microtvm", + "dev_how_tos", + "tutorials", + "vta/tutorials"] subsection_order = ExplicitOrder( [ - "../gallery/user_tutorials", + "../gallery/user_tutorial", "../tutorials/get_started", "../tutorials/frontend", "../tutorials/language", @@ -224,7 +255,7 @@ def git_describe_version(original_version): # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. 
within_subsection_order = { - "user_tutorials": [ + "user_tutorial": [ "introduction.py", "install.py", "tvmc_command_line_driver.py", @@ -233,20 +264,10 @@ def git_describe_version(original_version): "autotvm_matmul_x86.py", "auto_scheduler_matmul_x86.py", "cross_compilation_and_rpc.py", - "relay_quick_start.py", + "intro_topi.py", + "relay_quick_start.py" ], - "get_started": [ - "introduction.py", - "install.py", - "tvmc_command_line_driver.py", - "autotvm_relay_x86.py", - "tensor_expr_get_started.py", - "autotvm_matmul_x86.py", - "auto_scheduler_matmul_x86.py", - "cross_compilation_and_rpc.py", - "relay_quick_start.py", - ], - "frontend": [ + "compile_models": [ "from_pytorch.py", "from_tensorflow.py", "from_mxnet.py", @@ -257,7 +278,17 @@ def git_describe_version(original_version): "from_darknet.py", "from_caffe2.py", ], - "language": [ + "deploy_models": [ + "deploy_object_detection_pytorch.py", + "deploy_model_on_rasp.py", + "deploy_sparse.py", + "deploy_quantized.py", + "deploy_prequantized.py", + "deploy_prequantized_tflite.py", + "deploy_model_on_android.py", + "deploy_ssd_gluoncv.py" + ], + "work_with_te_schedules": [ "schedule_primitives.py", "reduction.py", "intrin_math.py", @@ -267,12 +298,12 @@ def git_describe_version(original_version): "tuple_inputs.py", "tedd.py", ], - "optimize": [ + "optimize_tensor_operators": [ "opt_gemm.py", "opt_conv_cuda.py", "opt_conv_tensorcore.py", ], - "autotvm": [ + "tune_with_templates_autotvm": [ "tune_simple_template.py", "tune_conv2d_cuda.py", "tune_relay_cuda.py", @@ -280,13 +311,13 @@ def git_describe_version(original_version): "tune_relay_arm.py", "tune_relay_mobile_gpu.py", ], - "auto_scheduler": [ + "tune_with_autoscheduler": [ "tune_matmul_x86.py", "tune_conv2d_layer_cuda.py", "tune_network_x86.py", "tune_network_cuda.py", ], - "dev": [ + "dev_how_tos": [ "low_level_custom_pass.py", "use_pass_infra.py", "use_pass_instrument.py", diff --git a/docs/dev_how_tos/index.rst b/docs/dev_how_tos/index.rst index e8cce9365df2..a578a206cd0d 100644 --- a/docs/dev_how_tos/index.rst +++ b/docs/dev_how_tos/index.rst @@ -1,25 +1,104 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at +:orphan: -.. http://www.apache.org/licenses/LICENSE-2.0 -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -.. _reference: +.. _sphx_glr_dev_how_tos: -Deep Dive -========= -TVM Developer Deep Dive +How To +------ +This gallery includes a number of how-tos on how to accomplish common +develpment tasks in TVM. -Refactor placeholder + + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_low_level_custom_pass_thumb.png + + :ref:`sphx_glr_dev_how_tos_low_level_custom_pass.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/low_level_custom_pass + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_use_pass_infra_thumb.png + + :ref:`sphx_glr_dev_how_tos_use_pass_infra.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/use_pass_infra + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_use_pass_instrument_thumb.png + + :ref:`sphx_glr_dev_how_tos_use_pass_instrument.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/use_pass_instrument + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /dev_how_tos/images/thumb/sphx_glr_bring_your_own_datatypes_thumb.png + + :ref:`sphx_glr_dev_how_tos_bring_your_own_datatypes.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /dev_how_tos/bring_your_own_datatypes +.. raw:: html + +
+ + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/contribute/code_guide.rst b/docs/dev_tutorial/code_guide.rst similarity index 100% rename from docs/contribute/code_guide.rst rename to docs/dev_tutorial/code_guide.rst diff --git a/docs/contribute/code_review.rst b/docs/dev_tutorial/code_review.rst similarity index 100% rename from docs/contribute/code_review.rst rename to docs/dev_tutorial/code_review.rst diff --git a/docs/contribute/committer_guide.rst b/docs/dev_tutorial/committer_guide.rst similarity index 100% rename from docs/contribute/committer_guide.rst rename to docs/dev_tutorial/committer_guide.rst diff --git a/docs/contribute/community.rst b/docs/dev_tutorial/community.rst similarity index 100% rename from docs/contribute/community.rst rename to docs/dev_tutorial/community.rst diff --git a/docs/contribute/document.rst b/docs/dev_tutorial/document.rst similarity index 100% rename from docs/contribute/document.rst rename to docs/dev_tutorial/document.rst diff --git a/docs/contribute/error_handling.rst b/docs/dev_tutorial/error_handling.rst similarity index 100% rename from docs/contribute/error_handling.rst rename to docs/dev_tutorial/error_handling.rst diff --git a/docs/contribute/git_howto.rst b/docs/dev_tutorial/git_howto.rst similarity index 100% rename from docs/contribute/git_howto.rst rename to docs/dev_tutorial/git_howto.rst diff --git a/docs/contribute/index.rst b/docs/dev_tutorial/index.rst similarity index 96% rename from docs/contribute/index.rst rename to docs/dev_tutorial/index.rst index e3e4119d803e..960071b189cb 100644 --- a/docs/contribute/index.rst +++ b/docs/dev_tutorial/index.rst @@ -15,8 +15,8 @@ specific language governing permissions and limitations under the License. -Contribute to TVM -================= +Tutorial: Contributing to TVM +============================= TVM has been developed by community members. Everyone is welcomed to contribute. diff --git a/docs/contribute/pull_request.rst b/docs/dev_tutorial/pull_request.rst similarity index 100% rename from docs/contribute/pull_request.rst rename to docs/dev_tutorial/pull_request.rst diff --git a/docs/contribute/release_process.rst b/docs/dev_tutorial/release_process.rst similarity index 100% rename from docs/contribute/release_process.rst rename to docs/dev_tutorial/release_process.rst diff --git a/docs/dev_tutorials/index.rst b/docs/dev_tutorials/index.rst deleted file mode 100644 index 2705f21907b0..000000000000 --- a/docs/dev_tutorials/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. 
_tutorial: - -Tutorial -======== -TVM Developer Tutorial - -Placeholder for documentation refactor diff --git a/docs/index.rst b/docs/index.rst index ad1cfc49d4dd..09f1e0ca727c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,21 +35,21 @@ For Developers - :doc:`dev/how_to` gives quick development tips on various topics. .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: TVM User Guide :hidden: - user_tutorials/index + user_tutorial/index user_how_tos/index user_deep_dive/index user_reference/index .. toctree:: - :maxdepth: 1 + :maxdepth: 3 :caption: TVM Developer Guide :hidden: - dev_tutorials/index + dev_tutorial/index dev_how_tos/index dev_deep_dive/index dev_reference/index diff --git a/docs/user_how_tos/index.rst b/docs/user_how_tos/index.rst index cc59879e1057..8684a9e729c5 100644 --- a/docs/user_how_tos/index.rst +++ b/docs/user_how_tos/index.rst @@ -15,9 +15,21 @@ specific language governing permissions and limitations under the License. -.. _user_guide: +.. _user_how_tos: How To ======= TVM How Tos +.. toctree:: + :maxdepth: 2 + + install/index + compile_models/index + deploy_models/index + work_with_relay/index + work_with_te_schedules/index + optimize_tensor_operators/index + tune_with_templates_autotvm/index + tune_with_autoscheduler/index + work_with_microtvm/index diff --git a/docs/install/docker.rst b/docs/user_how_tos/install/docker.rst similarity index 100% rename from docs/install/docker.rst rename to docs/user_how_tos/install/docker.rst diff --git a/docs/install/from_source.rst b/docs/user_how_tos/install/from_source.rst similarity index 100% rename from docs/install/from_source.rst rename to docs/user_how_tos/install/from_source.rst diff --git a/docs/install/index.rst b/docs/user_how_tos/install/index.rst similarity index 98% rename from docs/install/index.rst rename to docs/user_how_tos/install/index.rst index 5f739418add3..3d1a06a74be7 100644 --- a/docs/install/index.rst +++ b/docs/user_how_tos/install/index.rst @@ -17,7 +17,7 @@ .. _installation: -Installation +Install TVM ============ To install TVM, please read :ref:`install-from-source`. If you are interested in deploying to mobile/embedded devices, diff --git a/docs/install/nnpack.rst b/docs/user_how_tos/install/nnpack.rst similarity index 100% rename from docs/install/nnpack.rst rename to docs/user_how_tos/install/nnpack.rst diff --git a/docs/user_reference/index.rst b/docs/user_reference/index.rst index cc59879e1057..1f715217b2a1 100644 --- a/docs/user_reference/index.rst +++ b/docs/user_reference/index.rst @@ -17,7 +17,7 @@ .. _user_guide: -How To -======= -TVM How Tos +User Reference +============== +TVM User Reference diff --git a/docs/user_tutorial/index.rst b/docs/user_tutorial/index.rst new file mode 100644 index 000000000000..d239fc6a459d --- /dev/null +++ b/docs/user_tutorial/index.rst @@ -0,0 +1,223 @@ +:orphan: + + + +.. _sphx_glr_user_tutorial: + +Tutorial +-------- + +Within this gallery is an introduction to TVM. + + + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_introduction_thumb.png + + :ref:`sphx_glr_user_tutorial_introduction.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/introduction + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_install_thumb.png + + :ref:`sphx_glr_user_tutorial_install.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/install + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_tvmc_command_line_driver_thumb.png + + :ref:`sphx_glr_user_tutorial_tvmc_command_line_driver.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/tvmc_command_line_driver + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_autotvm_relay_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_autotvm_relay_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/autotvm_relay_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_tensor_expr_get_started_thumb.png + + :ref:`sphx_glr_user_tutorial_tensor_expr_get_started.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/tensor_expr_get_started + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_autotvm_matmul_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_autotvm_matmul_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/autotvm_matmul_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_auto_scheduler_matmul_x86_thumb.png + + :ref:`sphx_glr_user_tutorial_auto_scheduler_matmul_x86.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/auto_scheduler_matmul_x86 + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_cross_compilation_and_rpc_thumb.png + + :ref:`sphx_glr_user_tutorial_cross_compilation_and_rpc.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/cross_compilation_and_rpc + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_intro_topi_thumb.png + + :ref:`sphx_glr_user_tutorial_intro_topi.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/intro_topi + +.. raw:: html + +
+ +.. only:: html + + .. figure:: /user_tutorial/images/thumb/sphx_glr_relay_quick_start_thumb.png + + :ref:`sphx_glr_user_tutorial_relay_quick_start.py` + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /user_tutorial/relay_quick_start +.. raw:: html + +
+ + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/user_tutorials/index.rst b/docs/user_tutorials/index.rst deleted file mode 100644 index 4f01312426f2..000000000000 --- a/docs/user_tutorials/index.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _user_guide: - -Tutorial -========= -TVM User Tutorial - -This is a placeholder and is overwritten by Spinx Gallery -/tvm/gallery/user_tutorials diff --git a/gallery/dev_how_tos/README.txt b/gallery/dev_how_tos/README.txt new file mode 100644 index 000000000000..10c824a554f4 --- /dev/null +++ b/gallery/dev_how_tos/README.txt @@ -0,0 +1,5 @@ +How To +------ +This gallery includes a number of how-tos for accomplishing common +development tasks in TVM. + diff --git a/tutorials/dev/bring_your_own_datatypes.py b/gallery/dev_how_tos/bring_your_own_datatypes.py similarity index 100% rename from tutorials/dev/bring_your_own_datatypes.py rename to gallery/dev_how_tos/bring_your_own_datatypes.py diff --git a/tutorials/dev/low_level_custom_pass.py b/gallery/dev_how_tos/low_level_custom_pass.py similarity index 100% rename from tutorials/dev/low_level_custom_pass.py rename to gallery/dev_how_tos/low_level_custom_pass.py diff --git a/tutorials/dev/use_pass_infra.py b/gallery/dev_how_tos/use_pass_infra.py similarity index 100% rename from tutorials/dev/use_pass_infra.py rename to gallery/dev_how_tos/use_pass_infra.py diff --git a/tutorials/dev/use_pass_instrument.py b/gallery/dev_how_tos/use_pass_instrument.py similarity index 100% rename from tutorials/dev/use_pass_instrument.py rename to gallery/dev_how_tos/use_pass_instrument.py diff --git a/gallery/user_how_tos/compile_models/README.txt b/gallery/user_how_tos/compile_models/README.txt new file mode 100644 index 000000000000..871d975ab68d --- /dev/null +++ b/gallery/user_how_tos/compile_models/README.txt @@ -0,0 +1,4 @@ +Compile Deep Learning Models +============================ + +How to compile models from a variety of different frameworks diff --git a/tutorials/frontend/from_caffe2.py b/gallery/user_how_tos/compile_models/from_caffe2.py similarity index 100% rename from tutorials/frontend/from_caffe2.py rename to gallery/user_how_tos/compile_models/from_caffe2.py diff --git a/tutorials/frontend/from_coreml.py b/gallery/user_how_tos/compile_models/from_coreml.py similarity index 100% rename from tutorials/frontend/from_coreml.py rename to gallery/user_how_tos/compile_models/from_coreml.py diff --git a/tutorials/frontend/from_darknet.py b/gallery/user_how_tos/compile_models/from_darknet.py similarity index 100% rename from tutorials/frontend/from_darknet.py rename to gallery/user_how_tos/compile_models/from_darknet.py diff --git a/tutorials/frontend/from_keras.py
b/gallery/user_how_tos/compile_models/from_keras.py similarity index 100% rename from tutorials/frontend/from_keras.py rename to gallery/user_how_tos/compile_models/from_keras.py diff --git a/tutorials/frontend/from_mxnet.py b/gallery/user_how_tos/compile_models/from_mxnet.py similarity index 100% rename from tutorials/frontend/from_mxnet.py rename to gallery/user_how_tos/compile_models/from_mxnet.py diff --git a/tutorials/frontend/from_onnx.py b/gallery/user_how_tos/compile_models/from_onnx.py similarity index 100% rename from tutorials/frontend/from_onnx.py rename to gallery/user_how_tos/compile_models/from_onnx.py diff --git a/tutorials/frontend/from_pytorch.py b/gallery/user_how_tos/compile_models/from_pytorch.py similarity index 100% rename from tutorials/frontend/from_pytorch.py rename to gallery/user_how_tos/compile_models/from_pytorch.py diff --git a/tutorials/frontend/from_tensorflow.py b/gallery/user_how_tos/compile_models/from_tensorflow.py similarity index 100% rename from tutorials/frontend/from_tensorflow.py rename to gallery/user_how_tos/compile_models/from_tensorflow.py diff --git a/tutorials/frontend/from_tflite.py b/gallery/user_how_tos/compile_models/from_tflite.py similarity index 100% rename from tutorials/frontend/from_tflite.py rename to gallery/user_how_tos/compile_models/from_tflite.py diff --git a/gallery/user_how_tos/deploy_models/README.txt b/gallery/user_how_tos/deploy_models/README.txt new file mode 100644 index 000000000000..cf8abf4662ff --- /dev/null +++ b/gallery/user_how_tos/deploy_models/README.txt @@ -0,0 +1,4 @@ +Deploy Deep Learning Models +============================ + +How to deploy models from a variety of different frameworks diff --git a/tutorials/frontend/deploy_model_on_android.py b/gallery/user_how_tos/deploy_models/deploy_model_on_android.py similarity index 100% rename from tutorials/frontend/deploy_model_on_android.py rename to gallery/user_how_tos/deploy_models/deploy_model_on_android.py diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/gallery/user_how_tos/deploy_models/deploy_model_on_rasp.py similarity index 100% rename from tutorials/frontend/deploy_model_on_rasp.py rename to gallery/user_how_tos/deploy_models/deploy_model_on_rasp.py diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/gallery/user_how_tos/deploy_models/deploy_object_detection_pytorch.py similarity index 100% rename from tutorials/frontend/deploy_object_detection_pytorch.py rename to gallery/user_how_tos/deploy_models/deploy_object_detection_pytorch.py diff --git a/tutorials/frontend/deploy_prequantized.py b/gallery/user_how_tos/deploy_models/deploy_prequantized.py similarity index 100% rename from tutorials/frontend/deploy_prequantized.py rename to gallery/user_how_tos/deploy_models/deploy_prequantized.py diff --git a/tutorials/frontend/deploy_prequantized_tflite.py b/gallery/user_how_tos/deploy_models/deploy_prequantized_tflite.py similarity index 100% rename from tutorials/frontend/deploy_prequantized_tflite.py rename to gallery/user_how_tos/deploy_models/deploy_prequantized_tflite.py diff --git a/tutorials/frontend/deploy_quantized.py b/gallery/user_how_tos/deploy_models/deploy_quantized.py similarity index 100% rename from tutorials/frontend/deploy_quantized.py rename to gallery/user_how_tos/deploy_models/deploy_quantized.py diff --git a/tutorials/frontend/deploy_sparse.py b/gallery/user_how_tos/deploy_models/deploy_sparse.py similarity index 100% rename from tutorials/frontend/deploy_sparse.py rename to 
gallery/user_how_tos/deploy_models/deploy_sparse.py diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/gallery/user_how_tos/deploy_models/deploy_ssd_gluoncv.py similarity index 100% rename from tutorials/frontend/deploy_ssd_gluoncv.py rename to gallery/user_how_tos/deploy_models/deploy_ssd_gluoncv.py diff --git a/tutorials/optimize/README.txt b/gallery/user_how_tos/optimize_tensor_operators/README.txt similarity index 53% rename from tutorials/optimize/README.txt rename to gallery/user_how_tos/optimize_tensor_operators/README.txt index b051548c5351..a4a3c84c2018 100644 --- a/tutorials/optimize/README.txt +++ b/gallery/user_how_tos/optimize_tensor_operators/README.txt @@ -1,2 +1,4 @@ Optimize Tensor Operators ------------------------- + +How to optimize tensor operators using TVM. diff --git a/tutorials/optimize/opt_conv_cuda.py b/gallery/user_how_tos/optimize_tensor_operators/opt_conv_cuda.py similarity index 100% rename from tutorials/optimize/opt_conv_cuda.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_conv_cuda.py diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/gallery/user_how_tos/optimize_tensor_operators/opt_conv_tensorcore.py similarity index 100% rename from tutorials/optimize/opt_conv_tensorcore.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_conv_tensorcore.py diff --git a/tutorials/optimize/opt_gemm.py b/gallery/user_how_tos/optimize_tensor_operators/opt_gemm.py similarity index 100% rename from tutorials/optimize/opt_gemm.py rename to gallery/user_how_tos/optimize_tensor_operators/opt_gemm.py diff --git a/gallery/user_how_tos/tune_with_autoscheduler/README.txt b/gallery/user_how_tos/tune_with_autoscheduler/README.txt new file mode 100644 index 000000000000..6f69d8aa6f61 --- /dev/null +++ b/gallery/user_how_tos/tune_with_autoscheduler/README.txt @@ -0,0 +1,4 @@ +How to use AutoScheduler for Template-Free Auto Scheduling +---------------------------------------------------------- + +How to use the TVM AutoScheduler to automatically tune a schedule, template-free. 
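The how-tos gathered in this directory follow the same template-free flow as the getting-started tutorial moved by this patch: register a workload, create a search task, tune it, and apply the best schedule found. A minimal sketch of that flow, assuming a local TVM build with the LLVM backend (the shapes, trial count, and log file name are illustrative only, and as in the tutorial the tuning step may need to run under ``if __name__ == "__main__":`` on some platforms):

.. code-block:: python

    import tvm
    from tvm import te, auto_scheduler

    @auto_scheduler.register_workload  # the workload is identified by function name and arguments
    def matmul(N, L, M, dtype):
        A = te.placeholder((N, L), name="A", dtype=dtype)
        B = te.placeholder((L, M), name="B", dtype=dtype)
        k = te.reduce_axis((0, L), name="k")
        C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
        return [A, B, C]

    target = tvm.target.Target("llvm")
    task = auto_scheduler.SearchTask(func=matmul, args=(128, 128, 128, "float32"), target=target)

    log_file = "matmul_autoschedule.json"
    task.tune(auto_scheduler.TuningOptions(
        num_measure_trials=10,  # small count for demonstration; use more trials in practice
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    ))
    sch, args = task.apply_best(log_file)          # re-apply the best schedule from the log
    print(tvm.lower(sch, args, simple_mode=True))  # inspect the generated loop nest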
diff --git a/tutorials/auto_scheduler/ci_logs/conv2d.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/conv2d.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/conv2d.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/conv2d.json diff --git a/tutorials/auto_scheduler/ci_logs/matmul.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/matmul.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/matmul.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/matmul.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json diff --git a/tutorials/auto_scheduler/ci_logs/sparse_dense.json b/gallery/user_how_tos/tune_with_autoscheduler/ci_logs/sparse_dense.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/sparse_dense.json rename to gallery/user_how_tos/tune_with_autoscheduler/ci_logs/sparse_dense.json diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_conv2d_layer_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_conv2d_layer_cuda.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_conv2d_layer_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_arm.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_arm.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_arm.py diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_cuda.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_mali.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_mali.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_mali.py diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_network_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_x86.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_network_x86.py diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/gallery/user_how_tos/tune_with_autoscheduler/tune_sparse_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_sparse_x86.py rename to gallery/user_how_tos/tune_with_autoscheduler/tune_sparse_x86.py diff --git a/gallery/user_how_tos/tune_with_templates_autotvm/README.txt b/gallery/user_how_tos/tune_with_templates_autotvm/README.txt new file mode 100644 index 000000000000..99d844fef2d9 --- /dev/null +++ b/gallery/user_how_tos/tune_with_templates_autotvm/README.txt @@ -0,0 +1,4 @@ +How to Auto-Tune 
with Templates and AutoTVM +------------------------------------------- + +How to write schedule templates and auto-tune with AutoTVM diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_conv2d_cuda.py similarity index 100% rename from tutorials/autotvm/tune_conv2d_cuda.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_conv2d_cuda.py diff --git a/tutorials/autotvm/tune_relay_arm.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_arm.py similarity index 100% rename from tutorials/autotvm/tune_relay_arm.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_arm.py diff --git a/tutorials/autotvm/tune_relay_cuda.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_cuda.py similarity index 100% rename from tutorials/autotvm/tune_relay_cuda.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_cuda.py diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_mobile_gpu.py similarity index 100% rename from tutorials/autotvm/tune_relay_mobile_gpu.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_mobile_gpu.py diff --git a/tutorials/autotvm/tune_relay_x86.py b/gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_x86.py similarity index 100% rename from tutorials/autotvm/tune_relay_x86.py rename to gallery/user_how_tos/tune_with_templates_autotvm/tune_relay_x86.py diff --git a/gallery/user_how_tos/work_with_microtvm/README.txt b/gallery/user_how_tos/work_with_microtvm/README.txt new file mode 100644 index 000000000000..c77817dd9b7e --- /dev/null +++ b/gallery/user_how_tos/work_with_microtvm/README.txt @@ -0,0 +1,3 @@ +How to Work With microTVM +------------------------- +How to work with microTVM diff --git a/tutorials/micro/micro_reference_vm.py b/gallery/user_how_tos/work_with_microtvm/micro_reference_vm.py similarity index 100% rename from tutorials/micro/micro_reference_vm.py rename to gallery/user_how_tos/work_with_microtvm/micro_reference_vm.py diff --git a/tutorials/micro/micro_tflite.py b/gallery/user_how_tos/work_with_microtvm/micro_tflite.py similarity index 100% rename from tutorials/micro/micro_tflite.py rename to gallery/user_how_tos/work_with_microtvm/micro_tflite.py diff --git a/gallery/user_how_tos/work_with_relay/README.txt b/gallery/user_how_tos/work_with_relay/README.txt new file mode 100644 index 000000000000..20f7a076e397 --- /dev/null +++ b/gallery/user_how_tos/work_with_relay/README.txt @@ -0,0 +1,4 @@ +Work With Relay +=============== + +How-tos describing advanced compilation and deployment techniques with Relay diff --git a/tutorials/frontend/build_gcn.py b/gallery/user_how_tos/work_with_relay/build_gcn.py similarity index 100% rename from tutorials/frontend/build_gcn.py rename to gallery/user_how_tos/work_with_relay/build_gcn.py diff --git a/tutorials/frontend/using_external_lib.py b/gallery/user_how_tos/work_with_relay/using_external_lib.py similarity index 100% rename from tutorials/frontend/using_external_lib.py rename to gallery/user_how_tos/work_with_relay/using_external_lib.py diff --git a/gallery/user_how_tos/work_with_te_schedules/README.txt b/gallery/user_how_tos/work_with_te_schedules/README.txt new file mode 100644 index 000000000000..12499aebc720 --- /dev/null +++ b/gallery/user_how_tos/work_with_te_schedules/README.txt @@ -0,0 +1,4 @@ +Work with Tensor Expression and Schedules +----------------------------------------- + +How to work with Tensor Expressions and
Schedules diff --git a/tutorials/language/extern_op.py b/gallery/user_how_tos/work_with_te_schedules/extern_op.py similarity index 100% rename from tutorials/language/extern_op.py rename to gallery/user_how_tos/work_with_te_schedules/extern_op.py diff --git a/tutorials/language/intrin_math.py b/gallery/user_how_tos/work_with_te_schedules/intrin_math.py similarity index 100% rename from tutorials/language/intrin_math.py rename to gallery/user_how_tos/work_with_te_schedules/intrin_math.py diff --git a/tutorials/language/reduction.py b/gallery/user_how_tos/work_with_te_schedules/reduction.py similarity index 100% rename from tutorials/language/reduction.py rename to gallery/user_how_tos/work_with_te_schedules/reduction.py diff --git a/tutorials/language/scan.py b/gallery/user_how_tos/work_with_te_schedules/scan.py similarity index 100% rename from tutorials/language/scan.py rename to gallery/user_how_tos/work_with_te_schedules/scan.py diff --git a/tutorials/language/schedule_primitives.py b/gallery/user_how_tos/work_with_te_schedules/schedule_primitives.py similarity index 100% rename from tutorials/language/schedule_primitives.py rename to gallery/user_how_tos/work_with_te_schedules/schedule_primitives.py diff --git a/tutorials/language/tedd.py b/gallery/user_how_tos/work_with_te_schedules/tedd.py similarity index 100% rename from tutorials/language/tedd.py rename to gallery/user_how_tos/work_with_te_schedules/tedd.py diff --git a/tutorials/language/tensorize.py b/gallery/user_how_tos/work_with_te_schedules/tensorize.py similarity index 100% rename from tutorials/language/tensorize.py rename to gallery/user_how_tos/work_with_te_schedules/tensorize.py diff --git a/tutorials/language/tuple_inputs.py b/gallery/user_how_tos/work_with_te_schedules/tuple_inputs.py similarity index 100% rename from tutorials/language/tuple_inputs.py rename to gallery/user_how_tos/work_with_te_schedules/tuple_inputs.py diff --git a/gallery/user_tutorials/README.txt b/gallery/user_tutorial/README.txt similarity index 70% rename from gallery/user_tutorials/README.txt rename to gallery/user_tutorial/README.txt index 0d5a287feff4..ee90cd68eefc 100644 --- a/gallery/user_tutorials/README.txt +++ b/gallery/user_tutorial/README.txt @@ -1,4 +1,4 @@ -Tutorials ---------- +Tutorial +-------- Within this gallery is an introduction to TVM. 
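The ``work_with_te_schedules`` how-tos moved above all build on the same declare, schedule, and lower loop used throughout these tutorials. A minimal sketch of that loop, assuming a local TVM build (the vector-scaling workload and split factor are illustrative only):

.. code-block:: python

    import tvm
    from tvm import te

    n = 1024
    A = te.placeholder((n,), name="A", dtype="float32")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

    s = te.create_schedule(B.op)                    # default schedule: a single serial loop
    xo, xi = s[B].split(B.op.axis[0], factor=32)    # split the loop into 32-wide chunks
    s[B].parallel(xo)                               # run the outer loop in parallel
    print(tvm.lower(s, [A, B], simple_mode=True))   # print the scheduled TIR for inspection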
diff --git a/gallery/user_tutorials/auto_scheduler_matmul_x86.py b/gallery/user_tutorial/auto_scheduler_matmul_x86.py similarity index 100% rename from gallery/user_tutorials/auto_scheduler_matmul_x86.py rename to gallery/user_tutorial/auto_scheduler_matmul_x86.py diff --git a/gallery/user_tutorials/autotvm_matmul_x86.py b/gallery/user_tutorial/autotvm_matmul_x86.py similarity index 100% rename from gallery/user_tutorials/autotvm_matmul_x86.py rename to gallery/user_tutorial/autotvm_matmul_x86.py diff --git a/gallery/user_tutorials/autotvm_relay_x86.py b/gallery/user_tutorial/autotvm_relay_x86.py similarity index 100% rename from gallery/user_tutorials/autotvm_relay_x86.py rename to gallery/user_tutorial/autotvm_relay_x86.py diff --git a/gallery/user_tutorials/cross_compilation_and_rpc.py b/gallery/user_tutorial/cross_compilation_and_rpc.py similarity index 100% rename from gallery/user_tutorials/cross_compilation_and_rpc.py rename to gallery/user_tutorial/cross_compilation_and_rpc.py diff --git a/gallery/user_tutorials/install.py b/gallery/user_tutorial/install.py similarity index 100% rename from gallery/user_tutorials/install.py rename to gallery/user_tutorial/install.py diff --git a/tutorials/topi/intro_topi.py b/gallery/user_tutorial/intro_topi.py similarity index 100% rename from tutorials/topi/intro_topi.py rename to gallery/user_tutorial/intro_topi.py diff --git a/gallery/user_tutorials/introduction.py b/gallery/user_tutorial/introduction.py similarity index 100% rename from gallery/user_tutorials/introduction.py rename to gallery/user_tutorial/introduction.py diff --git a/gallery/user_tutorials/relay_quick_start.py b/gallery/user_tutorial/relay_quick_start.py similarity index 100% rename from gallery/user_tutorials/relay_quick_start.py rename to gallery/user_tutorial/relay_quick_start.py diff --git a/gallery/user_tutorials/tensor_expr_get_started.py b/gallery/user_tutorial/tensor_expr_get_started.py similarity index 100% rename from gallery/user_tutorials/tensor_expr_get_started.py rename to gallery/user_tutorial/tensor_expr_get_started.py diff --git a/gallery/user_tutorials/tvmc_command_line_driver.py b/gallery/user_tutorial/tvmc_command_line_driver.py similarity index 100% rename from gallery/user_tutorials/tvmc_command_line_driver.py rename to gallery/user_tutorial/tvmc_command_line_driver.py diff --git a/tutorials/auto_scheduler/README.txt b/tutorials/auto_scheduler/README.txt deleted file mode 100644 index 75986679f0bd..000000000000 --- a/tutorials/auto_scheduler/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -AutoScheduler : Template-free Auto Scheduling ---------------------------------------------- diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt deleted file mode 100644 index a1d33ba088cc..000000000000 --- a/tutorials/autotvm/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorials-autotvm-sec: - -AutoTVM : Template-based Auto Tuning ------------------------------------- diff --git a/tutorials/dev/README.txt b/tutorials/dev/README.txt deleted file mode 100644 index a358280640de..000000000000 --- a/tutorials/dev/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -Developer Tutorials -------------------- - diff --git a/tutorials/frontend/README.txt b/tutorials/frontend/README.txt deleted file mode 100644 index 319506d21f8f..000000000000 --- a/tutorials/frontend/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_tutorial-frontend: - -Compile Deep Learning Models ----------------------------- diff --git a/tutorials/get_started/README.txt b/tutorials/get_started/README.txt deleted file mode 100644 index aa6c559c1b38..000000000000 --- a/tutorials/get_started/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Getting Started With TVM ------------------------- diff --git a/tutorials/get_started/auto_scheduler_matmul_x86.py b/tutorials/get_started/auto_scheduler_matmul_x86.py deleted file mode 100644 index f9fb3615aedc..000000000000 --- a/tutorials/get_started/auto_scheduler_matmul_x86.py +++ /dev/null @@ -1,214 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Optimizing Operators with Auto-scheduling -========================================= -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -In this tutorial, we will show how TVM's Auto Scheduling feature can find -optimal schedules without the need for writing a custom template. - -Different from the template-based :doc:`AutoTVM ` which relies on -manual templates to define the search space, the auto-scheduler does not -require any templates. Users only need to write the computation declaration -without any schedule commands or templates. The auto-scheduler can -automatically generate a large search space and find a good schedule in the -space. - -We use matrix multiplication as an example in this tutorial. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. To - get it to run, you will need to wrap the body of this tutorial in a :code:`if - __name__ == "__main__":` block. -""" - -import os - -import numpy as np -import tvm -from tvm import te, auto_scheduler - -################################################################################ -# Defining the Matrix Multiplication -# ---------------------------------- -# To start, we define a matrix multiplication with a bias addition. Note that -# this uses standard operations available in TVMs Tensor Expression language. -# The major difference is the use of the `auto_sceduler` decorator at the top -# of the function definition. The function should return a list of -# input/output tensors. From these tensors, the auto-scheduler can get the -# whole computational graph. 
- - -@auto_scheduler.register_workload # Note the auto_scheduler decorator -def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - C = te.placeholder((N, M), name="C", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - matmul = te.compute( - (N, M), - lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name="matmul", - attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B - ) - out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") - - return [A, B, C, out] - - -################################################################################ -# Create the search task -# ---------------------- -# With the function defined, we can now create the task for the auto_scheduler -# to search against. We specify the particular parameters for this matrix -# multiplication, in this case a multiplication of to square matricies of size -# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" -# -# .. note:: Improve performance with custom targets -# In order for TVM to take full advantage of specific hardware platforms, -# you will want to manuall specify your CPU capabilities. For example: -# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 -# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 - -target = tvm.target.Target("llvm") -N = L = M = 1024 -task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) - -# Inspect the computational graph -print("Computational DAG:") -print(task.compute_dag) - -################################################################################ -# Set Parameters for Auto-Scheduler -# --------------------------------- -# Next, we set parameters for the auto-scheduler. -# -# * :code:`num_measure_trials` is the number of measurement trials we can use -# during the search. We only make 10 trials in this tutorial for a fast -# demonstration. In practice, 1000 is a good value for the search to converge. -# You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to log measurement records into a -# file `matmul.json`. The measurement records can be used to query the history -# best, resume the search, and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions` for more parameters - -log_file = "matmul.json" -tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, -) - -################################################################################ -# Run the search -# -------------- -# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the -# search and let the auto-scheduler do its magic. After some measurement -# trials, we can load the best schedule from the log file and apply it. - -# Run auto-tuning (search) -task.tune(tune_option) -# Apply the best schedule -sch, args = task.apply_best(log_file) - -################################################################################ -# Inspecting the Optimized Schedule -# --------------------------------- -# We can lower the schedule to see the IR after auto-scheduling. The -# auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and -# operator fusion. 
- -print("Lowered TIR:") -print(tvm.lower(sch, args, simple_mode=True)) - -################################################################################ -# Check correctness and evaluate performance -# ------------------------------------------ -# We build the binary and check its correctness and performance. - -func = tvm.build(sch, args, target) -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = np.random.uniform(size=(N, M)).astype(np.float32) -out_np = a_np.dot(b_np) + c_np - -dev = tvm.cpu() -a_tvm = tvm.nd.array(a_np, device=dev) -b_tvm = tvm.nd.array(b_np, device=dev) -c_tvm = tvm.nd.array(c_np, device=dev) -out_tvm = tvm.nd.empty(out_np.shape, device=dev) -func(a_tvm, b_tvm, c_tvm, out_tvm) - -# Check results -np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) - -# Evaluate execution time. -evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) -print( - "Execution time of this operator: %.3f ms" - % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) -) - - -################################################################################ -# Using the record file -# --------------------- -# During the search, all measurement records are logged into the record file -# "matmul.json". The measurement records can be used to re-apply search -# results, resume the search, and perform other analyses. -# -# Here is an example where we load the best schedule from a file, and print the -# equivalent python schedule API. This can be used for debugging and learning -# the behavior of the auto-scheduler. - -print("Equivalent python schedule:") -print(task.print_best(log_file)) - -################################################################################ -# A more complicated example is to resume the search. In this case, we need to -# create the search policy and cost model by ourselves and resume the status of -# search policy and cost model with the log file. In the example below we -# resume the status and do more 5 trials. - - -def resume_search(task, log_file): - print("Resume search:") - cost_model = auto_scheduler.XGBModel() - cost_model.update_from_file(log_file) - search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] - ) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] - ) - task.tune(tune_option, search_policy=search_policy) - - -resume_search(task, log_file) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to use the TVM Auto-Scheduler to -# automatically optimize a matrix multiplication, without the need to specify a -# search template. It ends a series of examples that starts from the Tensor -# Expression (TE) language that demonstrates how TVM can optimize computational -# operations. diff --git a/tutorials/get_started/autotvm_matmul_x86.py b/tutorials/get_started/autotvm_matmul_x86.py deleted file mode 100644 index f9b33b894192..000000000000 --- a/tutorials/get_started/autotvm_matmul_x86.py +++ /dev/null @@ -1,377 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-autotvm-matmul-x86: - -Optimizing Operators with Schedule Templates and AutoTVM -======================================================== -**Authors**: -`Lianmin Zheng `_, -`Chris Hoge `_ - -In this tutorial, we show how the TVM Tensor Expression (TE) language -can be used to write schedule templates that can be searched by AutoTVM to -find the optimal schedule. This process is called Auto-Tuning, which helps -automate the process of optimizing tensor computation. - -This tutorial builds on the previous `tutorial on how to write a matrix -multiplication using TE `. - -There are two steps in auto-tuning. - -- The first step is defining a search space. -- The second step is running a search algorithm to explore through this space. - -In this tutorial, you can learn how to perform these two steps in TVM. The whole -workflow is illustrated by a matrix multiplication example. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. - To get it to run, you will need to wrap the body of this tutorial in a - :code:`if __name__ == "__main__":` block. -""" - -################################################################################ -# Install dependencies -# -------------------- -# To use autotvm package in TVM, we need to install some extra dependencies. -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython as FFI of -# TVM. In the root directory of TVM, execute: -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Begin by importing the required packages. - -import logging -import sys - -import numpy as np -import tvm -from tvm import te -import tvm.testing - -# the module is called `autotvm` -from tvm import autotvm - -################################################################################ -# Basic Matrix Multiplication with TE -# ----------------------------------- -# Recall the basic implementation of matrix multiplication using TE. We write -# it down here with a few changes. We will wrap the multiplication in a python -# function definition. For simplicity, we will focus our attention on a split -# optimization, using a fixed value that defines the block size of the -# reordering. 
- - -def matmul_basic(N, L, M, dtype): - - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - yo, yi = s[C].split(y, 8) - xo, xi = s[C].split(x, 8) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Matrix Multiplication with AutoTVM -# ---------------------------------- -# In the previous schedule code, we use a constant "8" as the tiling factor. -# However, it might not be the best one because the best tiling factor depends -# on real hardware environment and input shape. -# -# If you want the schedule code to be portable across a wider range of input -# shapes and target hardware, it is better to define a set of candidate values -# and pick the best one according to the measurement results on target -# hardware. -# -# In autotvm, we can define a tunable parameter, or a "knob" for such kind of -# value. - -################################################################################ -# A Basic Matrix Multiplication Template -# -------------------------------------- -# We begin with an example of how to create a tunable parameter set for the -# block size of the `split` scheduling operation. - -# Matmul V1: List candidate values -@autotvm.template("tutorial/matmul_v1") # 1. use a decorator -def matmul_v1(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - # 2. get the config object - cfg = autotvm.get_config() - - # 3. define search space - cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) - cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) - - # 4. schedule according to config - yo, yi = s[C].split(y, cfg["tile_y"].val) - xo, xi = s[C].split(x, cfg["tile_x"].val) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Here we make four modifications to the previous schedule code and get a -# tunable "template". We can explain the modifications one by one. -# -# 1. Use a decorator to mark this function as a simple template. -# 2. Get a config object: You can regard this :code:`cfg` as an argument of -# this function but we obtain it in a different way. With this argument, this -# function is no longer a deterministic schedule. Instead, we can pass -# different configurations to this function and get different schedules. A -# function that uses a configuration object like this is called a "template". -# -# To make the template function more compact, we can do two things to define -# the parameter search space within a single function. -# -# 1. Define a search space across a set values. This is done by making -# :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the -# tunable knobs in this function and build a search space from it. -# 2. Schedule according to an entity in this space. This is done by making -# :code:`cfg` a :any:`ConfigEntity` object. 
When it is a -# :any:`ConfigEntity`, it will ignore all space definition API (namely, -# :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic -# values for all tunable knobs, and we schedule according to these values. -# -# During auto-tuning, we will first call this template with a -# :any:`ConfigSpace` object to build the search space. Then we call this -# template with different :any:`ConfigEntity` in the built space to get -# different schedules. Finally we will measure the code generated by -# different schedules and pick the best one. -# -# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible -# values. The second one is :code:`tile_x` with a same list of possible values. -# These two knobs are independent, so they span a search space with size 25 = -# 5x5. -# 4. The configuration knobs are passed to the :code:`split` schedule -# operation, allowing us to schedule according to the 5x5 deterministic values -# we previously defined in :code:`cfg`. - -################################################################################ -# A Matrix Multiplication Template with the Advanced Parameter API -# ---------------------------------------------------------------- -# In the previous template, we manually listed all of the possible values for a -# knob. This is the lowest level API to define the space, and gives an explicit -# enumeration of the parameter space to search. However, we also provide -# another set of APIs that can make the definition of the search space easier -# and smarter. Where possible, we receomment you use this higher-level API -# -# In the following example, we use :any:`ConfigSpace.define_split` to define a -# split knob. It will enumerate all the possible ways to split an axis and -# construct the space. -# -# We also have :any:`ConfigSpace.define_reorder` for reorder knob and -# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, -# thread binding. When the high level API cannot meet your requirements, you -# can always fall back to using the low level API. - - -@autotvm.template("tutorial/matmul") -def matmul(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# .. note:: More Explanation on :code:`cfg.define_split` -# -# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will -# enumerate all possible combinations that can split axis y into two axes with -# factors of the length of y. For example, if the length of y is 32 and we -# want to split it into two axes using factors of 32, then there are 6 -# possible values for (length of outer axis, length of inner axis) pair, -# namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6 -# possible values of `tile_y`. 
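For a concrete check of the enumeration described in this note, the candidate values are simply the factor pairs of the axis length, so plain Python is enough to list them (no TVM required; the length of 32 matches the example above):

.. code-block:: python

    length = 32
    # every (outer, inner) pair whose product is the axis length
    pairs = [(length // inner, inner) for inner in range(1, length + 1) if length % inner == 0]
    print(pairs)  # [(32, 1), (16, 2), (8, 4), (4, 8), (2, 16), (1, 32)]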
-# -# During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. -# We stores the lengths of outer axes and inner axes in -# :code:`cfg['tile_y'].size` (a tuple with two elements). In this template, -# we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. -# Actually, this is equivalent to :code:`yo, yi = s[C].split(y, -# cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y, -# nparts=cfg['tile_y"].size[0])` -# -# The advantage of using cfg.apply API is that it makes multi-level splits -# (that is, when num_outputs >= 3) easier. - -################################################################################ -# Step 2: Use AutoTVM to Optimize the Matrix Multiplication -# --------------------------------------------------------- -# In Step 1, we wrote a matrix multiplication template that allowed us to -# paramaterize the block size used in the `split` schedule. We can now conduct -# a search over this parameter space. The next step is to pick a tuner to guide -# the exploration of this space. -# -# Auto-tuners in TVM -# ~~~~~~~~~~~~~~~~~~ -# The job for a tuner can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# propose a batch of configs -# measure this batch of configs on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of configs, the tuner can take different -# strategies. Some of the tuner strategies provided by TVM include: -# -# * :any:`tvm.autotvm.tuner.RandomTuner`: Enumerate the space in a random order -# * :any:`tvm.autotvm.tuner.GridSearchTuner`: Enumerate the space in a grid search order -# * :any:`tvm.autotvm.tuner.GATuner`: Using genetic algorithm to search through the space -# * :any:`tvm.autotvm.tuner.XGBTuner`: Uses a model based method. Train a XGBoost model to -# predict the speed of lowered IR and pick the next batch according to the -# prediction. -# -# You can choose the tuner according to the size of your space, your time -# budget and other factors. For example, if your space is very small (less -# than 1000), a gridsearch tuner or a random tuner is good enough. If your -# space is at the level of 10^9 (this is the space size of a conv2d operator on -# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. - -################################################################################ -# Begin tuning -# ~~~~~~~~~~~~ -# Here we continue our matrix multiplication example. First we create a tuning -# task. We can also inspect the initialized search space. In this case, for a -# 512x512 square matrix multiplication, the space size is 10x10=100 Note that -# the task and search space are independent of the tuner picked. - -N, L, M = 512, 512, 512 -task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") -print(task.config_space) - -################################################################################ -# Then we need to define how to measure the generated code and pick a tuner. -# Since our space is small, a random tuner is just okay. -# -# We only make 10 trials in this tutorial for demonstration. In practice, you -# can do more trials according to your time budget. We will log the tuning -# results into a log file. This file can be used to choose the best -# configuration discovered by the tuner later. 
- -# logging config (for printing tuning log to the screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -################################################################################ -# There are two steps for measuring a config: build and run. By default, we use -# all CPU cores to compile program. We then measure them sequentially. To help -# reduce variance, we take 5 measurements and average them. -measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) - -# Begin tuning with RandomTuner, log records to file `matmul.log` -# You can use alternatives like XGBTuner. -tuner = autotvm.tuner.RandomTuner(task) -tuner.tune( - n_trial=10, - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file("matmul.log")], -) - -################################################################################ -# With tuning completed, we can choose the configuration from the log file that -# has the best measured performance and compile the schedule with the -# corresponding parameters. We also do a quick verfication that the schedule is -# producing correct answers. We can call the function :code:`matmul` directly -# under the :any:`autotvm.apply_history_best` context. When we call this -# function, it will query the dispatch context with its argument and get the -# best config with the same argument. - -# apply history best from log file -with autotvm.apply_history_best("matmul.log"): - with tvm.target.Target("llvm"): - s, arg_bufs = matmul(N, L, M, "float32") - func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = a_np.dot(b_np) - -c_tvm = tvm.nd.empty(c_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to build operator templates that allow -# TVM to search a parameter space and choose optimized schedule configurations. -# To gain a deeper understanding of how this works, we recommend expanding on -# this example by adding new search parameters to the schedule based on -# schedule operations demonstated in the `Getting Started With Tensor -# Expressions _` tutorial. In the upcoming sections, we -# will demonstate the AutoScheduler, a method for TVM to optimize common -# operators without the need for the user to provide a user-defined template. diff --git a/tutorials/get_started/autotvm_relay_x86.py b/tutorials/get_started/autotvm_relay_x86.py deleted file mode 100644 index 67faec4505a6..000000000000 --- a/tutorials/get_started/autotvm_relay_x86.py +++ /dev/null @@ -1,476 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compiling and Optimizing a Model with the Python Interface (AutoTVM) -==================================================================== -**Author**: -`Chris Hoge `_ - -In the `TVMC Tutorial `_, we covered how to compile, run, and tune a -pre-trained vision model, ResNet-50-v2 using the command line interface for -TVM, TVMC. TVM is more that just a command-line tool though, it is an -optimizing framework with APIs available for a number of different languages -that gives you tremendous flexibility in working with machine learning models. - -In this tutorial we will cover the same ground we did with TVMC, but show how -it is done with the Python API. Upon completion of this section, we will have -used the Python API for TVM to accomplish the following tasks: - -* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. -* Run a real image through the compiled model, and interpret the output and model - performance. -* Tune the model that model on a CPU using TVM. -* Re-compile an optimized model using the tuning data collected by TVM. -* Run the image through the optimized model, and compare the output and model - performance. - -The goal of this section is to give you an overview of TVM's capabilites and -how to use them through the Python API. -""" - -################################################################################ -# TVM is a deep learning compiler framework, with a number of different modules -# available for working with deep learning models and operators. In this -# tutorial we will work through how to load, compile, and optimize a model -# using the Python API. -# -# We begin by importing a number of dependencies, including ``onnx`` for -# loading and converting the model, helper utilities for downloading test data, -# the Python Image Library for working with the image data, ``numpy`` for pre -# and post-processing of the image data, the TVM Relay framework, and the TVM -# Graph Executor. - -import onnx -from tvm.contrib.download import download_testdata -from PIL import Image -import numpy as np -import tvm.relay as relay -import tvm -from tvm.contrib import graph_executor - -################################################################################ -# Downloading and Loading the ONNX Model -# -------------------------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50-layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading -# `Netron `_, a freely available ML model viewer. -# -# TVM provides a helper library to download pre-trained models. By providing a -# model URL, file name, and model type through the module, TVM will download -# the model and save it to disk. For the instance of an ONNX model, you can -# then load it into memory using the ONNX runtime. -# -# .. 
note:: Working with Other Model Formats -# -# TVM supports many popular model formats. A list can be found in the `Compile -# Deep Learning Models -# `_ -# section of the TVM Documentation. - -model_url = "".join( - [ - "https://github.com/onnx/models/raw/", - "master/vision/classification/resnet/model/", - "resnet50-v2-7.onnx", - ] -) - -model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx") -onnx_model = onnx.load(model_path) - -################################################################################ -# Downloading, Preprocessing, and Loading the Test Image -# ------------------------------------------------------ -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute image for any of your choosing. -# -# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center -# -# Download the image data, then convert it to a numpy array to use as an input to the model. - -img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -img_path = download_testdata(img_url, "imagenet_cat.png", module="data") - -# Resize it to 224x224 -resized_image = Image.open(img_path).resize((224, 224)) -img_data = np.asarray(resized_image).astype("float32") - -# Our input image is in HWC layout while ONNX expects CHW input, so convert the array -img_data = np.transpose(img_data, (2, 0, 1)) - -# Normalize according to the ImageNet input specification -imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) -imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) -norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev - -# Add the batch dimension, as we are expecting 4-dimensional input: NCHW. -img_data = np.expand_dims(norm_img_data, axis=0) - -############################################################################### -# Compile the Model With Relay -# ---------------------------- -# -# The next step is to compile the ResNet model. We begin by importing the model -# to relay using the `from_onnx` importer. We then build the model, with -# standard optimizations, into a TVM library. Finally, we create a TVM graph -# runtime module from the library. - -target = "llvm" - -###################################################################### -# .. note:: Defining the Correct Target -# -# Specifying the correct target can have a huge impact on the performance of -# the compiled module, as it can take advantage of hardware features -# available on the target. For more information, please refer to `Auto-tuning -# a convolutional network for x86 CPU -# `_. -# We recommend identifying which CPU you are running, along with optional -# features, and set the target appropriately. For example, for some -# processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm -# -mcpu=skylake-avx512"`` for processors with the AVX-512 vector instruction -# set. -# - -# The input name may vary across model types. 
You can use a tool -# like netron to check input names -input_name = "data" -shape_dict = {input_name: img_data.shape} - -mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -###################################################################### -# Execute on the TVM Runtime -# -------------------------- -# Now that we've compiled the model, we can use the TVM runtime to make -# predictions with it. To use TVM to run the model and make predictions, we -# need two things: -# -# - The compiled model, which we just produced. -# - Valid input to the model to make predictions on. - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -################################################################################ -# Collect Basic Performance Data -# ------------------------------ -# We want to collect some basic performance data associated with this -# unoptimized model and compare it to a tuned model later. To help account for -# CPU noise, we run the computation in multiple batches in multiple -# repetitions, then gather some basis statistics on the mean, median, and -# standard deviation. -import timeit - -timing_number = 10 -timing_repeat = 10 -unoptimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -unoptimized = { - "mean": np.mean(unoptimized), - "median": np.median(unoptimized), - "std": np.std(unoptimized), -} - -print(unoptimized) - -################################################################################ -# Postprocess the output -# ---------------------- -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet-50-V2 into a more human-readable form, using the lookup-table provided -# for the model. - -from scipy.special import softmax - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -# Open the output and read the output tensor -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -################################################################################ -# This should produce the following output: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Tune the model -# -------------- -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVM to target your working platform. 
-# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. -# The results of these runs are stored in a tuning records file. -# -# In the simplest form, tuning requires you to provide three things: -# -# - the target specification of the device you intend to run this model on -# - the path to an output file in which the tuning records will be stored -# - a path to the model to be tuned. -# - -import tvm.auto_scheduler as auto_scheduler -from tvm.autotvm.tuner import XGBTuner -from tvm import autotvm - -# Set up some basic parameters for the runner. The runner takes compiled code -# that is generated with a specific set of parameters and measures the -# performance of it. ``number`` specifies the number of different -# configurations that we will test, while ``repeat`` specifies how many -# measurements we will take of each configuration. ``min_repeat_ms`` is a value -# that specifies how long need to run configuration test. If the number of -# repeats falls under this time, it will be increased. This option is necessary -# for accurate tuning on GPUs, and is not required for CPU tuning. Setting this -# value to 0 disables it. The ``timeout`` places an upper limit on how long to -# run training code for each tested configuration. - -number = 10 -repeat = 1 -min_repeat_ms = 0 # since we're tuning on a CPU, can be set to 0 -timeout = 10 # in seconds - -# create a TVM runner -runner = autotvm.LocalRunner( - number=number, - repeat=repeat, - timeout=timeout, - min_repeat_ms=min_repeat_ms, - enable_cpu_cache_flush=True, -) - -# Create a simple structure for holding tuning options. We use an XGBoost -# algorithim for guiding the search. For a production job, you will want to set -# the number of trials to be larger than the value of 10 used here. For CPU we -# recommend 1500, for GPU 3000-4000. The number of trials required can depend -# on the particular model and processor, so it's worth spending some time -# evaluating performance across a range of values to find the best balance -# between tuning time and model optimization. Because running tuning is time -# intensive we set number of trials to 10, but do not recommend a value this -# small. The ``early_stopping`` parameter is the minimum number of trails to -# run before a condition that stops the search early can be applied. The -# measure option indicates where trial code will be built, and where it will be -# run. In this case, we're using the ``LocalRunner`` we just created and a -# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write -# the tuning data to. - -tuning_option = { - "tuner": "xgb", - "trials": 10, - "early_stopping": 100, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func="default"), runner=runner - ), - "tuning_records": "resnet-50-v2-autotuning.json", -} - -################################################################################ -# .. 
note:: Defining the Tuning Search Algorithm -# -# By default this search is guided using an `XGBoost Grid` algorithm. -# Depending on your model complexity and amount of time available, you might -# want to choose a different algorithm. - - -################################################################################ -# .. note:: Setting Tuning Parameters -# -# In this example, in the interest of time, we set the number of trials and -# early stopping to 10. You will likely see more performance improvements if -# you set these values to be higher but this comes at the expense of time -# spent tuning. The number of trials required for convergence will vary -# depending on the specifics of the model and the target platform. - -# begin by extracting the taks from the onnx model -tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params) - -# Tune the extracted tasks sequentially. -for i, task in enumerate(tasks): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - tuner_obj = XGBTuner(task, loss_type="rank") - tuner_obj.tune( - n_trial=min(tuning_option["trials"], len(task.config_space)), - early_stopping=tuning_option["early_stopping"], - measure_option=tuning_option["measure_option"], - callbacks=[ - autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix), - autotvm.callback.log_to_file(tuning_option["tuning_records"]), - ], - ) - -################################################################################ -# The output from this tuning process will look something like this: -# -# .. code-block:: bash -# -# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done. -# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done. -# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done. -# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done. -# # [Task 4/24] Current/Best: 11.96/ 156.49 GFLOPS | Progress: (960/1000) | 632.26 s Done. -# # [Task 5/24] Current/Best: 23.75/ 130.78 GFLOPS | Progress: (800/1000) | 739.29 s Done. -# # [Task 6/24] Current/Best: 38.29/ 198.31 GFLOPS | Progress: (1000/1000) | 624.51 s Done. -# # [Task 7/24] Current/Best: 4.31/ 210.78 GFLOPS | Progress: (1000/1000) | 701.03 s Done. -# # [Task 8/24] Current/Best: 50.25/ 185.35 GFLOPS | Progress: (972/1000) | 538.55 s Done. -# # [Task 9/24] Current/Best: 50.19/ 194.42 GFLOPS | Progress: (1000/1000) | 487.30 s Done. -# # [Task 10/24] Current/Best: 12.90/ 172.60 GFLOPS | Progress: (972/1000) | 607.32 s Done. -# # [Task 11/24] Current/Best: 62.71/ 203.46 GFLOPS | Progress: (1000/1000) | 581.92 s Done. -# # [Task 12/24] Current/Best: 36.79/ 224.71 GFLOPS | Progress: (1000/1000) | 675.13 s Done. -# # [Task 13/24] Current/Best: 7.76/ 219.72 GFLOPS | Progress: (1000/1000) | 519.06 s Done. -# # [Task 14/24] Current/Best: 12.26/ 202.42 GFLOPS | Progress: (1000/1000) | 514.30 s Done. -# # [Task 15/24] Current/Best: 31.59/ 197.61 GFLOPS | Progress: (1000/1000) | 558.54 s Done. -# # [Task 16/24] Current/Best: 31.63/ 206.08 GFLOPS | Progress: (1000/1000) | 708.36 s Done. -# # [Task 17/24] Current/Best: 41.18/ 204.45 GFLOPS | Progress: (1000/1000) | 736.08 s Done. -# # [Task 18/24] Current/Best: 15.85/ 222.38 GFLOPS | Progress: (980/1000) | 516.73 s Done. -# # [Task 19/24] Current/Best: 15.78/ 203.41 GFLOPS | Progress: (1000/1000) | 587.13 s Done. -# # [Task 20/24] Current/Best: 30.47/ 205.92 GFLOPS | Progress: (980/1000) | 471.00 s Done. 
-# # [Task 21/24] Current/Best: 46.91/ 227.99 GFLOPS | Progress: (308/1000) | 219.18 s Done. -# # [Task 22/24] Current/Best: 13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done. -# # [Task 23/24] Current/Best: 53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done. -# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done. - -################################################################################ -# Compiling an Optimized Model with Tuning Data -# ---------------------------------------------- -# -# As an output of the tuning process above, we obtained the tuning records -# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to -# generate high performance code for the model on your specified target. -# -# Now that tuning data for the model has been collected, we can re-compile the -# model using optimized operators to speed up our computations. - -with autotvm.apply_history_best(tuning_option["tuning_records"]): - with tvm.transform.PassContext(opt_level=3, config={}): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -################################################################################ -# Verify that the optimized model runs and produces the same results: - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -# Verifying that the predictions are the same: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610550 -# # class='n02123159 tiger cat' with probability=0.367181 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Comparing the Tuned and Untuned Models -# -------------------------------------- -# We want to collect some basic performance data associated with this optimized -# model to compare it to the unoptimized model. Depending on your underlying -# hardware, number of iterations, and other factors, you should see a performance -# improvement in comparing the optimized model to the unoptimized model. - -import timeit - -timing_number = 10 -timing_repeat = 10 -optimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)} - - -print("optimized: %s" % (optimized)) -print("unoptimized: %s" % (unoptimized)) - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we gave a short example of how to use the TVM Python API -# to compile, run, and tune a model. We also discussed the need for pre and -# post-processing of inputs and outputs. After the tuning process, we -# demonstrated how to compare the performance of the unoptimized and optimize -# models. -# -# Here we presented a simple example using ResNet 50 V2 locally. 
However, TVM -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/tutorials/get_started/cross_compilation_and_rpc.py deleted file mode 100644 index 25208369f74d..000000000000 --- a/tutorials/get_started/cross_compilation_and_rpc.py +++ /dev/null @@ -1,265 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-cross-compilation-and-rpc: - -Cross Compilation and RPC -========================= -**Author**: `Ziheng Jiang `_, `Lianmin Zheng `_ - -This tutorial introduces cross compilation and remote device -execution with RPC in TVM. - -With cross compilation and RPC, you can **compile a program on your -local machine then run it on the remote device**. It is useful when -the remote device resource are limited, like Raspberry Pi and mobile -platforms. In this tutorial, we will use the Raspberry Pi for a CPU example -and the Firefly-RK3399 for an OpenCL example. -""" - -###################################################################### -# Build TVM Runtime on Device -# --------------------------- -# -# The first step is to build the TVM runtime on the remote device. -# -# .. note:: -# -# All instructions in both this section and the next section should be -# executed on the target device, e.g. Raspberry Pi. We assume the target -# is running Linux. -# -# Since we do compilation on the local machine, the remote device is only used -# for running the generated code. We only need to build the TVM runtime on -# the remote device. -# -# .. code-block:: bash -# -# git clone --recursive https://github.com/apache/tvm tvm -# cd tvm -# make runtime -j2 -# -# After building the runtime successfully, we need to set environment variables -# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` -# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM -# directory is in :code:`~/tvm`): -# -# .. code-block:: bash -# -# export PYTHONPATH=$PYTHONPATH:~/tvm/python -# -# To update the environment variables, execute :code:`source ~/.bashrc`. - -###################################################################### -# Set Up RPC Server on Device -# --------------------------- -# To start an RPC server, run the following command on your remote device -# (Which is Raspberry Pi in this example). -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 -# -# If you see the line below, it means the RPC server started -# successfully on your device. -# -# .. 
code-block:: bash -# -# INFO:root:RPCServer: bind to 0.0.0.0:9090 -# - -###################################################################### -# Declare and Cross Compile Kernel on Local Machine -# ------------------------------------------------- -# -# .. note:: -# -# Now we go back to the local machine, which has a full TVM installed -# (with LLVM). -# -# Here we will declare a simple kernel on the local machine: - -import numpy as np - -import tvm -from tvm import te -from tvm import rpc -from tvm.contrib import utils - -n = tvm.runtime.convert(1024) -A = te.placeholder((n,), name="A") -B = te.compute((n,), lambda i: A[i] + 1.0, name="B") -s = te.create_schedule(B.op) - -###################################################################### -# Then we cross compile the kernel. -# The target should be 'llvm -mtriple=armv7l-linux-gnueabihf' for -# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable -# on our webpage building server. See the detailed note in the following block. - -local_demo = True - -if local_demo: - target = "llvm" -else: - target = "llvm -mtriple=armv7l-linux-gnueabihf" - -func = tvm.build(s, [A, B], target=target, name="add_one") -# save the lib at a local temp folder -temp = utils.tempdir() -path = temp.relpath("lib.tar") -func.export_library(path) - -###################################################################### -# .. note:: -# -# To run this tutorial with a real remote device, change :code:`local_demo` -# to False and replace :code:`target` in :code:`build` with the appropriate -# target triple for your device. The target triple which might be -# different for different devices. For example, it is -# :code:`'llvm -mtriple=armv7l-linux-gnueabihf'` for Raspberry Pi 3B and -# :code:`'llvm -mtriple=aarch64-linux-gnu'` for RK3399. -# -# Usually, you can query the target by running :code:`gcc -v` on your -# device, and looking for the line starting with :code:`Target:` -# (Though it may still be a loose configuration.) -# -# Besides :code:`-mtriple`, you can also set other compilation options -# like: -# -# * -mcpu= -# Specify a specific chip in the current architecture to generate code for. By default this is inferred from the target triple and autodetected to the current architecture. -# * -mattr=a1,+a2,-a3,... -# Override or control specific attributes of the target, such as whether SIMD operations are enabled or not. The default set of attributes is set by the current CPU. -# To get the list of available attributes, you can do: -# -# .. code-block:: bash -# -# llc -mtriple= -mattr=help -# -# These options are consistent with `llc `_. -# It is recommended to set target triple and feature set to contain specific -# feature available, so we can take full advantage of the features of the -# board. -# You can find more details about cross compilation attributes from -# `LLVM guide of cross compilation `_. - -###################################################################### -# Run CPU Kernel Remotely by RPC -# ------------------------------ -# We show how to run the generated CPU kernel on the remote device. -# First we obtain an RPC session from remote device. - -if local_demo: - remote = rpc.LocalSession() -else: - # The following is my environment, change this to the IP address of your target device - host = "10.77.1.162" - port = 9090 - remote = rpc.connect(host, port) - -###################################################################### -# Upload the lib to the remote device, then invoke a device local -# compiler to relink them. 
Now `func` is a remote module object. - -remote.upload(path) -func = remote.load_module("lib.tar") - -# create arrays on the remote device -dev = remote.cpu() -a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) -b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) -# the function will run on the remote device -func(a, b) -np.testing.assert_equal(b.numpy(), a.numpy() + 1) - -###################################################################### -# When you want to evaluate the performance of the kernel on the remote -# device, it is important to avoid the overhead of network. -# :code:`time_evaluator` will returns a remote function that runs the -# function over number times, measures the cost per run on the remote -# device and returns the measured cost. Network overhead is excluded. - -time_f = func.time_evaluator(func.entry_name, dev, number=10) -cost = time_f(a, b).mean -print("%g secs/op" % cost) - -######################################################################### -# Run OpenCL Kernel Remotely by RPC -# --------------------------------- -# For remote OpenCL devices, the workflow is almost the same as above. -# You can define the kernel, upload files, and run via RPC. -# -# .. note:: -# -# Raspberry Pi does not support OpenCL, the following code is tested on -# Firefly-RK3399. You may follow this `tutorial `_ -# to setup the OS and OpenCL driver for RK3399. -# -# Also we need to build the runtime with OpenCL enabled on rk3399 board. In the TVM -# root directory, execute -# -# .. code-block:: bash -# -# cp cmake/config.cmake . -# sed -i "s/USE_OPENCL OFF/USE_OPENCL ON/" config.cmake -# make runtime -j4 -# -# The following function shows how we run an OpenCL kernel remotely - - -def run_opencl(): - # NOTE: This is the setting for my rk3399 board. You need to modify - # them according to your environment. - opencl_device_host = "10.77.1.145" - opencl_device_port = 9090 - target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") - - # create schedule for the above "add one" compute declaration - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], target=target) - - remote = rpc.connect(opencl_device_host, opencl_device_port) - - # export and upload - path = temp.relpath("lib_cl.tar") - func.export_library(path) - remote.upload(path) - func = remote.load_module("lib_cl.tar") - - # run - dev = remote.cl() - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - func(a, b) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) - print("OpenCL test passed!") - - -###################################################################### -# Summary -# ------- -# This tutorial provides a walk through of cross compilation and RPC -# features in TVM. -# -# - Set up an RPC server on the remote device. -# - Set up the target device configuration to cross compile the kernels on the -# local machine. -# - Upload and run the kernels remotely via the RPC API. diff --git a/tutorials/get_started/install.py b/tutorials/get_started/install.py deleted file mode 100644 index b69b8b493a4f..000000000000 --- a/tutorials/get_started/install.py +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Installing TVM -============== -**Authors**: -`Jocelyn Shiue `_, -`Chris Hoge `_ - -Depending on your needs and your working environment, there are a few different -methods for installing TVM. These include: - -* Installing from source -* Installing from third-party binary package. -""" - -################################################################################ -# Installing from Source -# ---------------------- -# Installing from source is the recommended method for installing TVM. It will -# allow you to enable specific features such as GPU support, microcontroller -# support (microTVM), and a debugging runtime, and other features. You will also -# want to install from source if you want to actively contribute to the TVM -# project. The full instructions are on the `Install TVM From Source -# `_ page. - -################################################################################ -# Installing From Binary Packages -# -------------------------------- -# You may install convenient third-party binary package distributions to -# quickly try things out. TLCPack is a thirdparty volunteer community that -# builds binary packages from TVM source. It offers support matrix with -# instructions to install on different platforms, with different features. -# Checkout `TLCPack `_ to learn more. Note that the -# thirdparty binary packages could contain additional licensing terms for -# the hardware drivers that are bundled with it. diff --git a/tutorials/get_started/introduction.py b/tutorials/get_started/introduction.py deleted file mode 100644 index 0746c3983b61..000000000000 --- a/tutorials/get_started/introduction.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Introduction -============ -**Authors**: -`Jocelyn Shiue `_, -`Chris Hoge `_, -`Lianmin Zheng `_ - -Apache TVM is an open source machine learning compiler framework for CPUs, -GPUs, and machine learning accelerators. It aims to enable machine learning -engineers to optimize and run computations efficiently on any hardware backend. 
-The purpose of this tutorial is to take a guided tour through all of the major -features of TVM by defining and demonstrating key concepts. A new user should -be able to work through the tutorial from start to finish and be able to -operate TVM for automatic model optimization, while having a basic -understanding of the TVM architecture and how it works. - -Contents --------- - -#. :doc:`Introduction ` -#. :doc:`Installing TVM ` -#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` -#. :doc:`Compiling and Optimizing a Model with the Python Interface ` -#. :doc:`Working with Operators Using Tensor Expression ` -#. :doc:`Optimizing Operators with Templates and AutoTVM ` -#. :doc:`Optimizing Operators with Template-free AutoScheduler ` -#. :doc:`Cross Compilation and Remote Procedure Calls (RPC) ` -#. :doc:`Compiling Deep Learning Models for GPUs ` -""" - -################################################################################ -# An Overview of TVM and Model Optimization -# ========================================= -# -# The diagram below illustrates the steps a machine model takes as it is -# transformed with the TVM optimizing compiler framework. -# -# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png -# :width: 100% -# :alt: A High Level View of TVM -# -# 1. Import the model from a framework like *Tensorflow*, *Pytorch*, or *Onnx*. -# The importer layer is where TVM can ingest models from other frameworks, like -# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each -# frontend varies as we are constantly improving the open source project. If -# you're having issues importing your model into TVM, you may want to try -# converting it to ONNX. -# -# 2. Translate to *Relay*, TVM's high-level model language. -# A model that has been imported into TVM is represented in Relay. Relay is a -# functional language and intermediate representation (IR) for neural networks. -# It has support for: -# -# - Traditional data flow-style representations -# - Functional-style scoping, let-binding which makes it a fully featured -# differentiable language -# - Ability to allow the user to mix the two programming styles -# -# Relay applies graph-level optimization passes to optimize the model. -# -# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a -# higher-level representation is transformed into a lower-level -# representation. After applying the high-level optimizations, Relay -# runs FuseOps pass to partition the model into many small subgraphs and lowers -# the subgraphs to TE representation. Tensor Expression (TE) is a -# domain-specific language for describing tensor computations. -# TE also provides several *schedule* primitives to specify low-level loop -# optimizations, such as tiling, vectorization, parallelization, -# unrolling, and fusion. -# To aid in the process of converting Relay representation into TE representation, -# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined -# templates of common tensor operators (e.g., conv2d, transpose). -# -# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. -# A schedule specifies the low-level loop optimizations for an operator or -# subgraph defined in TE. Auto-tuning modules search for the best schedule -# and compare them with cost models and on-device measurements. -# There are two auto-tuning modules in TVM. -# -# - **AutoTVM**: A template-based auto-tuning module. 
It runs search algorithms -# to find the best values for the tunable knobs in a user-defined template. -# For common operators, their templates are already provided in TOPI. -# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. -# It does not require pre-defined schedule templates. Instead, it generates -# the search space automatically by analyzing the computation definition. -# It then searches for the best schedule in the generated search space. -# -# 5. Choose the optimal configurations for model compilation. After tuning, the -# auto-tuning module generates tuning records in JSON format. This step -# picks the best schedule for each subgraph. -# -# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level -# intermediate representation. After selecting the optimal configurations -# based on the tuning step, each TE subgraph is lowered to TIR and be -# optimized by low-level optimization passes. Next, the optimized TIR is -# lowered to the target compiler of the hardware platform. -# This is the final code generation phase to produce an optimized model -# that can be deployed into production. TVM supports several different -# compiler backends including: -# -# - LLVM, which can target arbitrary microprocessor architecture including -# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any -# other platform supported by LLVM. -# - Specialized compilers, such as NVCC, NVIDIA's compiler. -# - Embedded and specialized targets, which are implemented through TVM's -# Bring Your Own Codegen (BYOC) framework. -# -# 7. Compile down to machine code. At the end of this process, the -# compiler-specific generated code can be lowered to machine code. -# -# TVM can compile models down to a linkable object module, which can then be -# run with a lightweight TVM runtime that provides C APIs to dynamically -# load the model, and entry points for other languages such as Python and -# Rust. TVM can also build a bundled deployment in which the runtime is -# combined with the model in a single package. -# -# The remainder of the tutorial will cover these aspects of TVM in more detail. diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py deleted file mode 100644 index fd7f5aa9d756..000000000000 --- a/tutorials/get_started/relay_quick_start.py +++ /dev/null @@ -1,155 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-relay-quick-start: - -Quick Start Tutorial for Compiling Deep Learning Models -======================================================= -**Author**: `Yao Wang `_, `Truman Tian `_ - -This example shows how to build a neural network with Relay python frontend and -generates a runtime library for Nvidia GPU with TVM. 
-Notice that you need to build TVM with cuda and llvm enabled. -""" - -###################################################################### -# Overview for Supported Hardware Backend of TVM -# ---------------------------------------------- -# The image below shows hardware backend currently supported by TVM: -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png -# :align: center -# -# In this tutorial, we'll choose cuda and llvm as target backends. -# To begin with, let's import Relay and TVM. - -import numpy as np - -from tvm import relay -from tvm.relay import testing -import tvm -from tvm import te -from tvm.contrib import graph_executor -import tvm.testing - -###################################################################### -# Define Neural Network in Relay -# ------------------------------ -# First, let's define a neural network with relay python frontend. -# For simplicity, we'll use pre-defined resnet-18 network in Relay. -# Parameters are initialized with Xavier initializer. -# Relay also supports other model formats such as MXNet, CoreML, ONNX and -# Tensorflow. -# -# In this tutorial, we assume we will do inference on our device and -# the batch size is set to be 1. Input images are RGB color images of -# size 224 * 224. We can call the -# :py:meth:`tvm.relay.expr.TupleWrapper.astext()` to show the network -# structure. - -batch_size = 1 -num_class = 1000 -image_shape = (3, 224, 224) -data_shape = (batch_size,) + image_shape -out_shape = (batch_size, num_class) - -mod, params = relay.testing.resnet.get_workload( - num_layers=18, batch_size=batch_size, image_shape=image_shape -) - -# set show_meta_data=True if you want to show meta data -print(mod.astext(show_meta_data=False)) - -###################################################################### -# Compilation -# ----------- -# Next step is to compile the model using the Relay/TVM pipeline. -# Users can specify the optimization level of the compilation. -# Currently this value can be 0 to 3. The optimization passes include -# operator fusion, pre-computation, layout transformation and so on. -# -# :py:func:`relay.build` returns three components: the execution graph in -# json format, the TVM module library of compiled functions specifically -# for this graph on the target hardware, and the parameter blobs of -# the model. During the compilation, Relay does the graph-level -# optimization while TVM does the tensor-level optimization, resulting -# in an optimized runtime module for model serving. -# -# We'll first compile for Nvidia GPU. Behind the scene, :py:func:`relay.build` -# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., -# then registers the operators (i.e. the nodes of the optimized graphs) to -# TVM implementations to generate a `tvm.module`. -# To generate the module library, TVM will first transfer the high level IR -# into the lower intrinsic IR of the specified target backend, which is CUDA -# in this example. Then the machine code will be generated as the module library. - -opt_level = 3 -target = tvm.target.cuda() -with tvm.transform.PassContext(opt_level=opt_level): - lib = relay.build(mod, target, params=params) - -##################################################################### -# Run the generate library -# ------------------------ -# Now we can create graph executor and run the module on Nvidia GPU. 
- -# create random input -dev = tvm.cuda() -data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -# create module -module = graph_executor.GraphModule(lib["default"](dev)) -# set input and parameters -module.set_input("data", data) -# run -module.run() -# get output -out = module.get_output(0, tvm.nd.empty(out_shape)).numpy() - -# Print first 10 elements of output -print(out.flatten()[0:10]) - -###################################################################### -# Save and Load Compiled Module -# ----------------------------- -# We can also save the graph, lib and parameters into files and load them -# back in deploy environment. - -#################################################### - -# save the graph, lib and params into separate files -from tvm.contrib import utils - -temp = utils.tempdir() -path_lib = temp.relpath("deploy_lib.tar") -lib.export_library(path_lib) -print(temp.listdir()) - -#################################################### - -# load the module back. -loaded_lib = tvm.runtime.load_module(path_lib) -input_data = tvm.nd.array(data) - -module = graph_executor.GraphModule(loaded_lib["default"](dev)) -module.run(data=input_data) -out_deploy = module.get_output(0).numpy() - -# Print first 10 elements of output -print(out_deploy.flatten()[0:10]) - -# check whether the output from deployed module is consistent with original one -tvm.testing.assert_allclose(out_deploy, out, atol=1e-5) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py deleted file mode 100644 index 310d6bdbfee4..000000000000 --- a/tutorials/get_started/tensor_expr_get_started.py +++ /dev/null @@ -1,903 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-tensor-expr-get-started: - -Working with Operators Using Tensor Expression -============================================== -**Author**: `Tianqi Chen `_ - -In this tutorial we will turn our attention to how TVM works with Tensor -Expression (TE) to define tensor computations and apply loop optimizations. TE -describes tensor computations in a pure functional language (that is each -expression has no side effects). When viewed in context of the TVM as a whole, -Relay describes a computation as a set of operators, and each of these -operators can be represented as a TE expression where each TE expression takes -input tensors and produces an output tensor. - -This is an introductory tutorial to the Tensor Expression language in TVM. TVM -uses a domain specific tensor expression for efficient kernel construction. We -will demonstrate the basic workflow with two examples of using the tensor expression -language. The first example introduces TE and scheduling with vector -addition. 
The second expands on these concepts with a step-by-step optimization -of a matrix multiplication with TE. This matrix multiplication example will -serve as the comparative basis for future tutorials covering more advanced -features of TVM. -""" - -################################################################################ -# Example 1: Writing and Scheduling Vector Addition in TE for CPU -# --------------------------------------------------------------- -# -# Let's look at an example in Python in which we will implement a TE for -# vector addition, followed by a schedule targeted towards a CPU. -# We begin by initializing a TVM environment. - -import tvm -import tvm.testing -from tvm import te -import numpy as np - -# You will get better performance if you can identify the CPU you are targeting -# and specify it. If you're using llvm, you can get this information from the -# command ``llc --version`` to get the CPU type, and you can check -# ``/proc/cpuinfo`` for additional extensions that your processor might -# support. For example, you can use "llvm -mcpu=skylake-avx512" for CPUs with -# AVX-512 instructions. - -tgt = tvm.target.Target(target="llvm", host="llvm") - -################################################################################ -# Describing the Vector Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We describe a vector addition computation. TVM adopts tensor semantics, with -# each intermediate result represented as a multi-dimensional array. The user -# needs to describe the computation rule that generates the tensors. We first -# define a symbolic variable ``n`` to represent the shape. We then define two -# placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then -# describe the result tensor ``C``, with a ``compute`` operation. The -# ``compute`` defines a computation, with the output conforming to the -# specified tensor shape and the computation to be performed at each position -# in the tensor defined by the lambda function. Note that while ``n`` is a -# variable, it defines a consistent shape between the ``A``, ``B`` and ``C`` -# tensors. Remember, no actual computation happens during this phase, as we -# are only declaring how the computation should be done. - -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -################################################################################ -# .. note:: Lambda Functions -# -# The second argument to the ``te.compute`` method is the function that -# performs the computation. In this example, we're using an anonymous function, -# also known as a ``lambda`` function, to define the computation, in this case -# addition on the ``i``th element of ``A`` and ``B``. - -################################################################################ -# Create a Default Schedule for the Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# While the above lines describe the computation rule, we can compute ``C`` in -# many different ways to fit different devices. For a tensor with multiple -# axes, you can choose which axis to iterate over first, or computations can be -# split across different threads. TVM requires that the user to provide a -# schedule, which is a description of how the computation should be performed. -# Scheduling operations within TE can change loop orders, split computations -# across different threads, group blocks of data together, amongst other -# operations. 
An important concept behind schedules is that they only describe -# how the computation is performed, so different schedules for the same TE will -# produce the same result. -# -# TVM allows you to create a naive schedule that will compute ``C`` in by -# iterating in row major order. -# -# .. code-block:: c -# -# for (int i = 0; i < n; ++i) { -# C[i] = A[i] + B[i]; -# } - -s = te.create_schedule(C.op) - -###################################################################### -# Compile and Evaluate the Default Schedule -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# With the TE expression and a schedule, we can produce runnable code for our -# target language and architecture, in this case LLVM and a CPU. We provide -# TVM with the schedule, a list of the TE expressions that are in the schedule, -# the target and host, and the name of the function we are producing. The result -# of the output is a type-erased function that can be called directly from Python. -# -# In the following line, we use tvm.build to create a function. The build -# function takes the schedule, the desired signature of the function (including -# the inputs and outputs) as well as target language we want to compile to. - -fadd = tvm.build(s, [A, B, C], tgt, name="myadd") - -################################################################################ -# Let's run the function, and compare the output to the same computation in -# numpy. The compiled TVM function is exposes a concise C API that can be invoked -# from any language. We begin by creating a device, which is a device (CPU in this -# example) that TVM can compile the schedule to. In this case the device is an -# LLVM CPU target. We can then initialize the tensors in our device and -# perform the custom addition operation. To verify that the computation is -# correct, we can compare the result of the output of the c tensor to the same -# computation performed by numpy. - -dev = tvm.device(tgt.kind.name, 0) - -n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) -fadd(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# To get a comparison of how fast this version is compared to numpy, create a -# helper function to run a profile of the TVM generated code. 
-import timeit - -np_repeat = 100 -np_running_time = timeit.timeit( - setup="import numpy\n" - "n = 32768\n" - 'dtype = "float32"\n' - "a = numpy.random.rand(n, 1).astype(dtype)\n" - "b = numpy.random.rand(n, 1).astype(dtype)\n", - stmt="answer = a + b", - number=np_repeat, -) -print("Numpy running time: %f" % (np_running_time / np_repeat)) - - -def evaluate_addition(func, target, optimization, log): - dev = tvm.device(target.kind.name, 0) - n = 32768 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - - evaluator = func.time_evaluator(func.entry_name, dev, number=10) - mean_time = evaluator(a, b, c).mean - print("%s: %f" % (optimization, mean_time)) - - log.append((optimization, mean_time)) - - -log = [("numpy", np_running_time / np_repeat)] -evaluate_addition(fadd, tgt, "naive", log=log) - -################################################################################ -# Updating the Schedule to Use Paralleism -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now that we've illustrated the fundamentals of TE, let's go deeper into what -# schedules do, and how they can be used to optimize tensor expressions for -# different architectures. A schedule is a series of steps that are applied to -# an expression to transform it in a number of different ways. When a schedule -# is applied to an expression in TE, the inputs and outputs remain the same, -# but when compiled the implementation of the expression can change. This -# tensor addition, in the default schedule, is run serially but is easy to -# parallelize across all of the processor threads. We can apply the parallel -# schedule operation to our computation. - -s[C].parallel(C.op.axis[0]) - -################################################################################ -# The ``tvm.lower`` command will generate the Intermediate Representation (IR) -# of the TE, with the corresponding schedule. By lowering the expression as we -# apply different schedule operations, we can see the effect of scheduling on -# the ordering of the computation. We use the flag ``simple_mode=True`` to -# return a readable C-style statement. - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# It's now possible for TVM to run these blocks on independent threads. Let's -# compile and run this new schedule with the parallel operation applied: - -fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") -fadd_parallel(a, b, c) - -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -evaluate_addition(fadd_parallel, tgt, "parallel", log=log) - -################################################################################ -# Updating the Schedule to Use Vectorization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Modern CPUs also have the ability to perform SIMD operations on floating -# point values, and we can apply another schedule to our computation expression -# to take advantage of this. Accomplishing this requires multiple steps: first -# we have to split the schedule into inner and outer loops using the split -# scheduling primitive. The inner loops can use vectorization to use SIMD -# instructions using the vectorize scheduling primitive, then the outer loops -# can be parallelized using the parallel scheduling primitive. Choose the split -# factor to be the number of threads on your CPU. 
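################################################################################
# As a small optional aside (not part of the original schedule code that
# follows), the Python standard library can report how many CPU cores the
# current machine has, which is a reasonable starting point when picking the
# split factor. The code below still sets its own explicit value.

import os

print("Detected CPU cores:", os.cpu_count())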
- -# Recreate the schedule, since we modified it with the parallel operation in -# the previous example -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -s = te.create_schedule(C.op) - -# This factor should be chosen to match the number of threads appropriate for -# your CPU. This will vary depending on architecture, but a good rule is -# setting this factor to equal the number of available CPU cores. -factor = 4 - -outer, inner = s[C].split(C.op.axis[0], factor=factor) -s[C].parallel(outer) -s[C].vectorize(inner) - -fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") - -evaluate_addition(fadd_vector, tgt, "vector", log=log) - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Comparing the Diferent Schedules -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We can now compare the different schedules - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - - -################################################################################ -# .. note:: Code Specialization -# -# As you may have noticed, the declarations of ``A``, ``B`` and ``C`` all -# take the same shape argument, ``n``. TVM will take advantage of this to -# pass only a single shape argument to the kernel, as you will find in the -# printed device code. This is one form of specialization. -# -# On the host side, TVM will automatically generate check code that checks -# the constraints in the parameters. So if you pass arrays with different -# shapes into fadd, an error will be raised. -# -# We can do more specializations. For example, we can write :code:`n = -# tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, in the -# computation declaration. The generated function will only take vectors with -# length 1024. - -################################################################################ -# We've defined, scheduled, and compiled a vector addition operator, which we -# were then able to execute on the TVM runtime. We can save the operator as a -# library, which we can then load later using the TVM runtime. - -################################################################################ -# Targeting Vector Addition for GPUs (Optional) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TVM is capable of targeting multiple architectures. In the next example, we -# will target compilation of the vector addition to GPUs. - -# If you want to run this code, change ``run_cuda = True`` -# Note that by default this example is not run in the docs CI. - -run_cuda = False -if run_cuda: - # Change this target to the correct backend for you gpu. For example: cuda (NVIDIA GPUs), - # rocm (Radeon GPUS), OpenCL (opencl). - tgt_gpu = tvm.target.Target(target="cuda", host="llvm") - - # Recreate the schedule - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - print(type(C)) - - s = te.create_schedule(C.op) - - bx, tx = s[C].split(C.op.axis[0], factor=64) - - ################################################################################ - # Finally we must bind the iteration axis bx and tx to threads in the GPU - # compute grid. 
The naive schedule is not valid for GPUs, and these are - # specific constructs that allow us to generate code that runs on a GPU. - - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - - ###################################################################### - # Compilation - # ----------- - # After we have finished specifying the schedule, we can compile it - # into a TVM function. By default TVM compiles into a type-erased - # function that can be directly called from the python side. - # - # In the following line, we use tvm.build to create a function. - # The build function takes the schedule, the desired signature of the - # function (including the inputs and outputs) as well as target language - # we want to compile to. - # - # The result of compilation fadd is a GPU device function (if GPU is - # involved) as well as a host wrapper that calls into the GPU - # function. fadd is the generated host wrapper function, it contains - # a reference to the generated device function internally. - - fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd") - - ################################################################################ - # The compiled TVM function is exposes a concise C API that can be invoked from - # any language. - # - # We provide a minimal array API in python to aid quick testing and prototyping. - # The array API is based on the `DLPack `_ standard. - # - # - We first create a GPU device. - # - Then tvm.nd.array copies the data to the GPU. - # - ``fadd`` runs the actual computation - # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness). - # - # Note that copying the data to and from the memory on the GPU is a required step. - - dev = tvm.device(tgt_gpu.kind.name, 0) - - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - ################################################################################ - # Inspect the Generated GPU Code - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # You can inspect the generated code in TVM. The result of tvm.build is a TVM - # Module. fadd is the host module that contains the host wrapper, it also - # contains a device module for the CUDA (GPU) function. - # - # The following code fetches the device module and prints the content code. - - if ( - tgt_gpu.kind.name == "cuda" - or tgt_gpu.kind.name == "rocm" - or tgt_gpu.kind.name.startswith("opencl") - ): - dev_module = fadd.imported_modules[0] - print("-----GPU code-----") - print(dev_module.get_source()) - else: - print(fadd.get_source()) - -################################################################################ -# Saving and Loading Compiled Modules -# ----------------------------------- -# Besides runtime compilation, we can save the compiled modules into a file and -# load them back later. -# -# The following code first performs the following steps: -# -# - It saves the compiled host module into an object file. -# - Then it saves the device module into a ptx file. 
-# - cc.create_shared calls a compiler (gcc) to create a shared library - -from tvm.contrib import cc -from tvm.contrib import utils - -temp = utils.tempdir() -fadd.save(temp.relpath("myadd.o")) -if tgt.kind.name == "cuda": - fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) -if tgt.kind.name == "rocm": - fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) -if tgt.kind.name.startswith("opencl"): - fadd.imported_modules[0].save(temp.relpath("myadd.cl")) -cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) -print(temp.listdir()) - -################################################################################ -# .. note:: Module Storage Format -# -# The CPU (host) module is directly saved as a shared library (.so). There -# can be multiple customized formats of the device code. In our example, the -# device code is stored in ptx, as well as a meta data json file. They can be -# loaded and linked separately via import. - -################################################################################ -# Load Compiled Module -# ~~~~~~~~~~~~~~~~~~~~ -# We can load the compiled module from the file system and run the code. The -# following code loads the host and device module separately and links them -# together. We can verify that the newly loaded function works. - -fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) -if tgt.kind.name == "cuda": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name == "rocm": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name.startswith("opencl"): - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) - fadd1.import_module(fadd1_dev) - -fadd1(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# Pack Everything into One Library -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# In the above example, we store the device and host code separately. TVM also -# supports export everything as one shared library. Under the hood, we pack -# the device modules into binary blobs and link them together with the host -# code. Currently we support packing of Metal, OpenCL and CUDA modules. - -fadd.export_library(temp.relpath("myadd_pack.so")) -fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) -fadd2(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# .. note:: Runtime API and Thread-Safety -# -# The compiled modules of TVM do not depend on the TVM compiler. Instead, -# they only depend on a minimum runtime library. The TVM runtime library -# wraps the device drivers and provides thread-safe and device agnostic calls -# into the compiled functions. -# -# This means that you can call the compiled TVM functions from any thread, on -# any GPUs, provided that you have compiled the code for that GPU. - -################################################################################ -# Generate OpenCL Code -# -------------------- -# TVM provides code generation features into multiple backends. We can also -# generate OpenCL code or LLVM code that runs on CPU backends. -# -# The following code blocks generate OpenCL code, creates array on an OpenCL -# device, and verifies the correctness of the code. 
-
-if tgt.kind.name.startswith("opencl"):
-    fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd")
-    print("------opencl code------")
-    print(fadd_cl.imported_modules[0].get_source())
-    dev = tvm.cl(0)
-    n = 1024
-    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-    fadd_cl(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-################################################################################
-# .. note:: TE Scheduling Primitives
-#
-#   TVM includes a number of different scheduling primitives:
-#
-#   - split: splits a specified axis into two axes by the defined factor.
-#   - tile: splits a computation across two axes by the defined factors.
-#   - fuse: fuses two consecutive axes of one computation.
-#   - reorder: can reorder the axes of a computation into a defined order.
-#   - bind: can bind a computation to a specific thread, useful in GPU programming.
-#   - compute_at: by default, TVM will compute tensors at the outermost level
-#     of the function, or the root. compute_at specifies that one tensor should
-#     be computed at the first axis of computation for another operator.
-#   - compute_inline: when marked inline, a computation will be expanded then
-#     inserted into the address where the tensor is required.
-#   - compute_root: moves a computation to the outermost layer, or root, of the
-#     function. This means that the stage of the computation will be fully
-#     computed before it moves on to the next stage.
-#
-#   A complete description of these primitives can be found in the
-#   `Schedule Primitives <https://tvm.apache.org/docs/tutorials/language/schedule_primitives.html>`_ docs page.
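-#
-# As a quick, self-contained illustration of two of these primitives, the
-# snippet below tiles a simple element-wise computation into 4x4 blocks and
-# then fuses the two outer block loops back into a single loop. The ``_demo``
-# names are used only for this sketch and do not appear in the examples that
-# follow.
-
-M_demo = te.var("m")
-N_demo = te.var("n")
-A_demo = te.placeholder((M_demo, N_demo), name="A_demo")
-B_demo = te.compute((M_demo, N_demo), lambda i, j: A_demo[i, j] * 2, name="B_demo")
-
-s_demo = te.create_schedule(B_demo.op)
-# tile splits the row and column axes into outer (block) and inner loops
-io, jo, ii, ji = s_demo[B_demo].tile(B_demo.op.axis[0], B_demo.op.axis[1], 4, 4)
-# fuse merges the two outer block loops into one, which could then be parallelized
-fused = s_demo[B_demo].fuse(io, jo)
-print(tvm.lower(s_demo, [A_demo, B_demo], simple_mode=True))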
-
-################################################################################
-# Example 2: Manually Optimizing Matrix Multiplication with TE
-# ------------------------------------------------------------
-#
-# Now we will consider a second, more advanced example, demonstrating how with
-# just 18 lines of Python code TVM speeds up a common matrix multiplication operation by 18x.
-#
-# **Matrix multiplication is a compute intensive operation. There are
-# two important optimizations for good CPU performance:**
-#
-# 1. Increase the cache hit rate of memory access. Both complex
-#    numerical computation and hot-spot memory access can be
-#    accelerated by a high cache hit rate. This requires us to
-#    transform the original memory access pattern to a pattern that fits
-#    the cache policy.
-#
-# 2. SIMD (single instruction, multiple data), also known as the vector
-#    processing unit. On each cycle, instead of processing a single
-#    value, SIMD can process a small batch of data. This requires us
-#    to transform the data access pattern in the loop body into a uniform
-#    pattern so that the LLVM backend can lower it to SIMD.
-#
-# The techniques used in this tutorial are a subset of tricks mentioned in this
-# `repository `_. Some of them
-# have been applied automatically by the TVM abstraction, but some of them
-# cannot be automatically applied due to TVM constraints.
-
-################################################################################
-# Preparation and Performance Baseline
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# We begin by collecting performance data on the `numpy` implementation of
-# matrix multiplication.
-
-# timeit is used below to benchmark the numpy baseline
-import timeit
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy
-
-# The size of the matrix
-# (M, K) x (K, N)
-# You are free to try out different shapes; sometimes TVM optimization outperforms numpy with MKL.
-M = 1024
-K = 1024
-N = 1024
-
-# The default tensor data type in tvm
-dtype = "float32"
-
-# You will want to adjust the target to match any CPU vector extensions you
-# might have. For example, if you're using the Intel AVX2 (Advanced Vector
-# Extensions) ISA for SIMD, you can get the best performance by changing the
-# following line to ``llvm -mcpu=core-avx2``, or to the specific CPU type you use.
-# Since you're using LLVM, you can get the CPU type from the command
-# ``llc --version``, and you can check ``/proc/cpuinfo`` for additional
-# extensions that your processor might support.
-
-target = tvm.target.Target(target="llvm", host="llvm")
-dev = tvm.device(target.kind.name, 0)
-
-# Randomly generated tensors for testing
-a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev)
-b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev)
-
-# Repeatedly perform a matrix multiplication to get a performance baseline
-# for the default numpy implementation
-np_repeat = 100
-np_running_time = timeit.timeit(
-    setup="import numpy\n"
-    "M = " + str(M) + "\n"
-    "K = " + str(K) + "\n"
-    "N = " + str(N) + "\n"
-    'dtype = "float32"\n'
-    "a = numpy.random.rand(M, K).astype(dtype)\n"
-    "b = numpy.random.rand(K, N).astype(dtype)\n",
-    stmt="answer = numpy.dot(a, b)",
-    number=np_repeat,
-)
-print("Numpy running time: %f" % (np_running_time / np_repeat))
-
-answer = numpy.dot(a.numpy(), b.numpy())
-
-################################################################################
-# Now we write a basic matrix multiplication using TVM TE and verify that it
-# produces the same results as the numpy implementation. We also write a
-# function that will help us measure the performance of the schedule
-# optimizations.
-
-# TVM Matrix Multiplication using TE
-k = te.reduce_axis((0, K), "k")
-A = te.placeholder((M, K), name="A")
-B = te.placeholder((K, N), name="B")
-C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")
-
-# Default schedule
-s = te.create_schedule(C.op)
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-
-def evaluate_operation(s, vars, target, name, optimization, log):
-    func = tvm.build(s, [A, B, C], target=target, name="mmult")
-    assert func
-
-    c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-    func(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-    evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-    mean_time = evaluator(a, b, c).mean
-    print("%s: %f" % (optimization, mean_time))
-    log.append((optimization, mean_time))
-
-
-log = []
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="none", log=log)
-
-################################################################################
-# Let's take a look at the intermediate representation of the operator and
-# default schedule using the TVM lower function. Note how the implementation is
-# essentially a naive implementation of a matrix multiplication, using three
-# nested loops over the indices of the A and B matrices. 
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 1: Blocking
-# ~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# An important trick to enhance the cache hit rate is blocking, where you
-# structure memory access such that the memory accessed inside a block is a
-# small neighborhood with high memory locality. In this tutorial, we pick a
-# block factor of 32. This results in a block that fills a 32 * 32 * sizeof(float)
-# area of memory, or 4 KB, compared to a typical L1 cache size of 32 KB.
-#
-# We begin by creating a default schedule for the ``C`` operation, then apply a
-# ``tile`` scheduling primitive to it with the specified block factor, with the
-# scheduling primitive returning the resulting loop order from outermost to
-# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. We then get
-# the reduction axis for the output of the operation, and perform a split
-# operation on it using a factor of 4. This factor doesn't directly impact the
-# blocking optimization we're working on right now, but will be useful later
-# when we apply vectorization.
-#
-# Now that the operation has been blocked, we can reorder the computation to
-# put the reduction operation into the outermost loop of the computation,
-# helping to guarantee that the blocked data remains in cache. This completes
-# the schedule, and we can build and test the performance compared to the naive
-# schedule.
-
-bn = 32
-
-# Blocking by loop tiling
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-# Hoist reduction domain outside the blocking loop
-s[C].reorder(xo, yo, ko, ki, xi, yi)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log)
-
-################################################################################
-# By reordering the computation to take advantage of caching, you should see a
-# significant improvement in the performance of the computation. Now, print the
-# internal representation and compare it to the original:
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 2: Vectorization
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Another important optimization trick is vectorization. When the memory access
-# pattern is uniform, the compiler can detect this pattern and pass the
-# contiguous memory to the SIMD vector processor. In TVM, we can use the
-# ``vectorize`` interface to hint to the compiler about this pattern, taking
-# advantage of this hardware feature.
-#
-# In this tutorial, we choose to vectorize the inner loop row data since it is
-# already cache friendly from our previous optimizations.
-
-# Apply the vectorization optimization
-s[C].vectorize(yi)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="vectorization", log=log)
-
-# The generalized IR after vectorization
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 3: Loop Permutation
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# If we look at the above IR, we can see the inner loop row data is vectorized
-# and B is transformed into PackedB (this is evident by the `(float32x32*)B2`
-# portion of the inner loop). The traversal of PackedB is sequential now. 
So we
-# will look at the access pattern of A. In the current schedule, A is accessed
-# column by column, which is not cache friendly. If we change the nested loop
-# order of `ki` and the inner axis `xi`, the access pattern for the A matrix
-# will be more cache friendly.
-
-s = te.create_schedule(C.op)
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-# re-ordering
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
-
-evaluate_operation(
-    s, [A, B, C], target=target, name="mmult", optimization="loop permutation", log=log
-)
-
-# Again, print the new generalized IR
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 4: Array Packing
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Another important trick is array packing. The trick is to reorder the
-# storage dimensions of the array so that the continuous access pattern on a
-# certain dimension becomes sequential after flattening.
-#
-# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
-#    :align: center
-#
-# As shown in the figure above, after blocking the computations, we
-# can observe the array access pattern of B (after flattening), which is
-# regular but discontinuous. We expect that after some transformation we can
-# get a continuous access pattern. By reordering a ``[16][16]`` array to a
-# ``[16/4][16][4]`` array, the access pattern of B will be sequential when
-# grabbing the corresponding value from the packed array.
-#
-# To accomplish this, we are going to have to start with a new default
-# schedule, taking into account the new packing of B. It's worth taking a
-# moment to comment on this: TE is a powerful and expressive language for
-# writing optimized operators, but it often requires some knowledge of the
-# underlying algorithm, data structures, and hardware target that you are
-# writing for. Later in the tutorial, we will discuss some of the options for
-# letting TVM take that burden. Regardless, let's move on with the new
-# optimized schedule.
-
-# We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
-C = te.compute(
-    (M, N),
-    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
-    name="C",
-)
-
-s = te.create_schedule(C.op)
-
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
-
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
-
-x, y, z = s[packedB].op.axis
-s[packedB].vectorize(z)
-s[packedB].parallel(x)
-
-evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="array packing", log=log)
-
-# Here is the generated IR after array packing.
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################
-# Optimization 5: Optimizing Block Writing Through Caching
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#
-# Up to this point, all of our optimizations have focused on efficiently
-# accessing and computing the data from the `A` and `B` matrices to compute the
-# `C` matrix. After the blocking optimization, the operator will write results
-# to `C` block by block, and the access pattern is not sequential. 
We can -# address this by using a sequential cache array, using a combination of -# `cache_write`, `compute_at`, and `unroll`to hold the block results and write -# to `C` when all the block results are ready. - -s = te.create_schedule(C.op) - -# Allocate write cache -CC = s.cache_write(C, "global") - -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) - -# Write cache is computed at yo -s[CC].compute_at(s[C], yo) - -# New inner axes -xc, yc = s[CC].op.axis - -(k,) = s[CC].op.reduce_axis -ko, ki = s[CC].split(k, factor=4) -s[CC].reorder(ko, xc, ki, yc) -s[CC].unroll(ki) -s[CC].vectorize(yc) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="block caching", log=log) - -# Here is the generated IR after write cache blocking. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 6: Parallelization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# So far, our computation is only designed to use a single core. Nearly all -# modern processors have multiple cores, and computation can benefit from -# running computations in parallel. The final optimization is to take advantage -# of thread-level parallelization. - -# parallel -s[C].parallel(xo) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation( - s, [A, B, C], target=target, name="mmult", optimization="parallelization", log=log -) - -# Here is the generated IR after parallelization. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Summary of Matrix Multiplication Example -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# After applying the above simple optimizations with only 18 lines of code, our -# generated code can begin to approach the performance of `numpy` with the Math -# Kernel Library (MKL). Since we've been logging the performance as we've been -# working, we can compare the results. - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - -################################################################################ -# Note that the outputs on the web page reflect the running times on a -# non-exclusive Docker container, and should be considered unreliable. It is -# highly encouraged to run the tutorial by yourself to observe the performance -# gain achieved by TVM, and to carefully work through each example to -# understand the iterative improvements that are made to the matrix -# multiplication operation. - -################################################################################ -# Final Notes and Summary -# ----------------------- -# As mentioned earlier, how to apply optimizations using TE and scheduling -# primitives can require some knowledge of the underlying architecture and -# algorithms. However, TE was designed to act as a foundation for more complex -# algorithms that can search the potential optimization. With the knowledge you -# have from this introduction to TE, we can now begin to explore how TVM can -# automate the schedule optimization process. 
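-#
-# Before moving on, here is a minimal sketch that pulls the whole vector
-# addition workflow together in one place: declare the computation, create a
-# schedule, build it, and save it to disk. The ``_recap`` names are used only
-# for this sketch.
-
-n_recap = te.var("n")
-A_recap = te.placeholder((n_recap,), name="A")
-B_recap = te.placeholder((n_recap,), name="B")
-C_recap = te.compute(A_recap.shape, lambda i: A_recap[i] + B_recap[i], name="C")
-s_recap = te.create_schedule(C_recap.op)
-fadd_recap = tvm.build(s_recap, [A_recap, B_recap, C_recap], tgt, name="myadd_recap")
-fadd_recap.export_library(temp.relpath("myadd_recap.so"))
-
-################################################################################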
-#
-# This tutorial provided a walkthrough of the TVM Tensor Expression (TE) workflow
-# using vector addition and matrix multiplication examples. The general workflow
-# is:
-#
-# - Describe your computation via a series of operations.
-# - Describe how we want the computation to be performed using schedule primitives.
-# - Compile to the target function we want.
-# - Optionally, save the function to be loaded later.
-#
-# Upcoming tutorials expand on the matrix multiplication example, and show how
-# you can build generic templates of the matrix multiplication and other
-# operations with tunable parameters that allow you to automatically optimize
-# the computation for specific platforms.
diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py
deleted file mode 100644
index c729b86a3245..000000000000
--- a/tutorials/get_started/tvmc_command_line_driver.py
+++ /dev/null
@@ -1,511 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Compiling and Optimizing a Model with TVMC
-==========================================
-**Authors**:
-`Leandro Nunes `_,
-`Matthew Barrett `_,
-`Chris Hoge `_
-
-In this section, we will work with TVMC, the TVM command line driver. TVMC is a
-tool that exposes TVM features such as auto-tuning, compiling, profiling and
-execution of models through a command line interface.
-
-Upon completion of this section, we will have used TVMC to accomplish the
-following tasks:
-
-* Compile a pre-trained ResNet 50 v2 model for the TVM runtime.
-* Run a real image through the compiled model, and interpret the output and
-  model performance.
-* Tune the model on a CPU using TVM.
-* Re-compile an optimized model using the tuning data collected by TVM.
-* Run the image through the optimized model, and compare the output and model
-  performance.
-
-The goal of this section is to give you an overview of TVM and TVMC's
-capabilities, and set the stage for understanding how TVM works.
-"""
-
-################################################################################
-# Using TVMC
-# ----------
-#
-# TVMC is a Python application, part of the TVM Python package.
-# When you install TVM using a Python package, you will get TVMC
-# as a command line application called ``tvmc``. The location of this command
-# will vary depending on your platform and installation method.
-#
-# Alternatively, if you have TVM as a Python module on your
-# ``$PYTHONPATH``, you can access the command line driver functionality
-# via the executable python module, ``python -m tvm.driver.tvmc``.
-#
-# For simplicity, this tutorial will mention the TVMC command line using
-# ``tvmc ``, but the same results can be obtained with
-# ``python -m tvm.driver.tvmc ``.
-#
-# You can check the help page using:
-#
-# .. 
code-block:: bash -# -# tvmc --help -# -# The main features of TVM available to ``tvmc`` are from subcommands -# ``compile``, and ``run``, and ``tune``. To read about specific options under -# a given subcommand, use ``tvmc --help``. We will cover each of -# these commands in this tutorial, but first we need to download a pre-trained -# model to work with. -# - - -################################################################################ -# Obtaining the Model -# ------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50-layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading `Netron -# `, a freely available ML model viewer. -# -# For this tutorial we will be using the model in ONNX format. -# -# .. code-block:: bash -# -# wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx -# - - -################################################################################ -# .. note:: Supported model formats -# -# TVMC supports models created with Keras, ONNX, TensorFlow, TFLite -# and Torch. Use the option``--model-format`` if you need to -# explicitly provide the model format you are using. See ``tvmc -# compile --help`` for more information. -# - -################################################################################ -# .. note:: Adding ONNX Support to TVM -# -# TVM relies on the ONNX python library being available on your system. You -# can install ONNX using the command ``pip3 install --user onnx``. You may -# remove the ``--user`` option if you have root access and want to install -# ONNX globally. -# - -################################################################################ -# Compiling an ONNX Model to the TVM Runtime -# ------------------------------------------ -# -# Once we've downloaded the ResNet-50 model, the next step is to compile it. To -# accomplish that, we are going to use ``tvmc compile``. The output we get from -# the compilation process is a TAR package of the model compiled to a dynamic -# library for our target platform. We can run that model on our target device -# using the TVM runtime. -# -# .. code-block:: bash -# -# tvmc compile \ -# --target "llvm" \ -# --output resnet50-v2-7-tvm.tar \ -# resnet50-v2-7.onnx -# -# Let's take a look at the files that ``tvmc compile`` creates in the module: -# -# .. code-block:: bash -# -# mkdir model -# tar -xvf resnet50-v2-7-tvm.tar -C model -# ls model -# -# You will see three files listed. -# -# * ``mod.so`` is the model, represented as a C++ library, that can be loaded -# by the TVM runtime. -# * ``mod.json`` is a text representation of the TVM Relay computation graph. -# * ``mod.params`` is a file containing the parameters for the pre-trained -# model. -# -# This module can be directly loaded by your application, and the model can be -# run via the TVM runtime APIs. - - -################################################################################ -# .. note:: Defining the Correct Target -# -# Specifying the correct target (option ``--target``) can have a huge -# impact on the performance of the compiled module, as it can take -# advantage of hardware features available on the target. 
For more -# information, please refer to `Auto-tuning a convolutional network -# for x86 CPU `_. -# We recommend identifying which CPU you are running, along with optional features, -# and set the target appropriately. -# - -################################################################################ -# Running the Model from The Compiled Module with TVMC -# ---------------------------------------------------- -# -# Now that we've compiled the model to this module, we can use the TVM runtime -# to make predictions with it. TVMC has the TVM runtime built in to it, -# allowing you to run compiled TVM models. To use TVMC to run the model and -# make predictions, we need two things: -# -# - The compiled module, which we just produced. -# - Valid input to the model to make predictions on. -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This -# is a well-supported NumPy format to serialize multiple arrays into a file -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute image for any of your choosing. -# -# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center - - -################################################################################ -# Input pre-processing -# ~~~~~~~~~~~~~~~~~~~~ -# -# For our ResNet 50 V2 model, the input is expected to be in ImageNet format. -# Here is an example of a script to pre-process an image for ResNet 50 V2. -# -# You will need to have a supported version of the Python Image Library -# installed. You can use ``pip3 install --user pillow`` to satisfy this -# requirement for the script. -# -# .. code-block:: python -# :caption: preprocess.py -# :name: preprocess.py -# -# #!python ./preprocess.py -# from tvm.contrib.download import download_testdata -# from PIL import Image -# import numpy as np -# -# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") -# -# # Resize it to 224x224 -# resized_image = Image.open(img_path).resize((224, 224)) -# img_data = np.asarray(resized_image).astype("float32") -# -# # ONNX expects NCHW input, so convert the array -# img_data = np.transpose(img_data, (2, 0, 1)) -# -# # Normalize according to ImageNet -# imagenet_mean = np.array([0.485, 0.456, 0.406]) -# imagenet_stddev = np.array([0.229, 0.224, 0.225]) -# norm_img_data = np.zeros(img_data.shape).astype("float32") -# for i in range(img_data.shape[0]): -# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] -# -# # Add batch dimension -# img_data = np.expand_dims(norm_img_data, axis=0) -# -# # Save to .npz (outputs imagenet_cat.npz) -# np.savez("imagenet_cat", data=img_data) -# - -################################################################################ -# Running the Compiled Module -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# With both the model and input data in hand, we can now run TVMC to make a -# prediction: -# -# .. code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# resnet50-v2-7-tvm.tar -# -# Recall that the `.tar` model file includes a C++ library, a description of -# the Relay model, and the parameters for the model. 
TVMC includes the TVM -# runtime, which can load the model and make predictions against input. When -# running the above command, TVMC outputs a new file, ``predictions.npz``, that -# contains the model output tensors in NumPy format. -# -# In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via an RPC -# Tracker. To read more about these options please check ``tvmc run --help``. - -################################################################################ -# Output Post-Processing -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet 50 V2 into a more human-readable form, using the lookup-table provided -# for the model. -# -# The script below shows an example of the post-processing to extract labels -# from the output of our compiled module. -# -# .. code-block:: python -# :caption: postprocess.py -# :name: postprocess.py -# -# #!python ./postprocess.py -# import os.path -# import numpy as np -# -# from scipy.special import softmax -# -# from tvm.contrib.download import download_testdata -# -# # Download a list of labels -# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -# labels_path = download_testdata(labels_url, "synset.txt", module="data") -# -# with open(labels_path, "r") as f: -# labels = [l.rstrip() for l in f] -# -# output_file = "predictions.npz" -# -# # Open the output and read the output tensor -# if os.path.exists(output_file): -# with np.load(output_file) as data: -# scores = softmax(data["output_0"]) -# scores = np.squeeze(scores) -# ranks = np.argsort(scores)[::-1] -# -# for rank in ranks[0:5]: -# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) -# -# Running this script should produce the following output: -# -# .. code-block:: bash -# -# python postprocess.py -# -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 -# -# Try replacing the cat image with other images, and see what sort of -# predictions the ResNet model makes. - -################################################################################ -# Automatically Tuning the ResNet Model -# ------------------------------------- -# -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVMC to target your working platform. -# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. 
-
-# The results of these runs are stored in a tuning records file, which is
-# ultimately the output of the ``tune`` subcommand.
-#
-# In the simplest form, tuning requires you to provide three things:
-#
-# - the target specification of the device you intend to run this model on
-# - the path to an output file in which the tuning records will be stored, and
-#   finally
-# - a path to the model to be tuned.
-#
-# The example below demonstrates how that works in practice:
-#
-# .. code-block:: bash
-#
-#   tvmc tune \
-#   --target "llvm" \
-#   --output resnet50-v2-7-autotuner_records.json \
-#   resnet50-v2-7.onnx
-#
-# In this example, you will see better results if you indicate a more specific
-# target for the `--target` flag. For example, on an Intel i7 processor you
-# could use `--target llvm -mcpu=skylake`. For this tuning example, we are
-# tuning locally on the CPU using LLVM as the compiler for the specified
-# architecture.
-#
-# TVMC will perform a search against the parameter space for the model, trying
-# out different configurations for operators and choosing the one that runs
-# fastest on your platform. Although this is a guided search based on the CPU
-# and model operations, it can still take several hours to complete the search.
-# The output of this search will be saved to the
-# `resnet50-v2-7-autotuner_records.json` file, which will later be used to
-# compile an optimized model.
-#
-# .. note:: Defining the Tuning Search Algorithm
-#
-#   By default this search is guided using an `XGBoost Grid` algorithm.
-#   Depending on your model complexity and the amount of time available, you
-#   might want to choose a different algorithm. A full list is available by
-#   consulting ``tvmc tune --help``.
-#
-# The output will look something like this for a consumer-level Skylake CPU:
-#
-# .. code-block:: bash
-#
-#   tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx
-#   # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done.
-#   # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
-#   # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done.
-#   # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done.
-#   # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done.
-#   # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done.
-#   # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done.
-#   # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done.
-#   # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done.
-#   # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done.
-#   # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done.
-#   # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done.
-#   # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done.
-#   # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done.
-#   # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done.
-#   # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done.
-#   # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done. 
-# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. -# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. -# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. -# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. -# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. -# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. -# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. -# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. -# -# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning -# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning -# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information. -# - -################################################################################ -# Compiling an Optimized Model with Tuning Data -# ---------------------------------------------- -# -# As an output of the tuning process above, we obtained the tuning records -# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in -# two ways: -# -# - As input to further tuning (via ``tvmc tune --tuning-records``). -# - As input to the compiler -# -# The compiler will use the results to generate high performance code for the -# model on your specified target. To do that we can use ``tvmc compile -# --tuning-records``. Check ``tvmc compile --help`` for more information. -# -# Now that tuning data for the model has been collected, we can re-compile the -# model using optimized operators to speed up our computations. -# -# .. code-block:: bash -# -# tvmc compile \ -# --target "llvm" \ -# --tuning-records resnet50-v2-7-autotuner_records.json \ -# --output resnet50-v2-7-tvm_autotuned.tar \ -# resnet50-v2-7.onnx -# -# Verify that the optimized model runs and produces the same results: -# -# .. code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# resnet50-v2-7-tvm_autotuned.tar -# -# python postprocess.py -# -# Verifying that the predictions are the same: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610550 -# # class='n02123159 tiger cat' with probability=0.367181 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Comparing the Tuned and Untuned Models -# -------------------------------------- -# -# TVMC gives you tools for basic performance benchmarking between the models. -# You can specify a number of repetitions and that TVMC report on the model run -# time (independent of runtime startup). We can get a rough idea of how much -# tuning has improved the model performance. For example, on a test Intel i7 -# system, we see that the tuned model runs 47% faster than the untuned model: -# -# .. 
code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm_autotuned.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 92.19 115.73 89.85 3.15 -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 193.32 219.97 185.04 7.11 -# - - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we presented TVMC, a command line driver for TVM. We -# demonstrated how to compile, run, and tune a model. We also discussed the -# need for pre and post-processing of inputs and outputs. After the tuning -# process, we demonstrated how to compare the performance of the unoptimized -# and optimize models. -# -# Here we presented a simple example using ResNet 50 V2 locally. However, TVMC -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. -# -# To see what other options are available, please have a look at ``tvmc -# --help``. -# -# In the next tutorial, `Compiling and Optimizing a Model with the Python -# Interface `_, we will cover the same compilation -# and optimization steps using the Python interface. diff --git a/tutorials/language/README.txt b/tutorials/language/README.txt deleted file mode 100644 index 6da8e3c57c1f..000000000000 --- a/tutorials/language/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Tensor Expression and Schedules -------------------------------- diff --git a/tutorials/micro/README.txt b/tutorials/micro/README.txt deleted file mode 100644 index 70a5e580ecd1..000000000000 --- a/tutorials/micro/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorial-micro: - -microTVM --------- diff --git a/tutorials/topi/README.txt b/tutorials/topi/README.txt deleted file mode 100644 index eae0aafafc02..000000000000 --- a/tutorials/topi/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -TOPI: TVM Operator Inventory -----------------------------