From c511f209095b82eca463e9a7863dffff93068332 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 7 Dec 2020 18:18:16 +0800 Subject: [PATCH 1/5] [Auto Scheduler] Mali Support --- .../tvm/auto_scheduler/relay_integration.py | 8 +- python/tvm/relay/op/strategy/mali.py | 51 +++ python/tvm/topi/mali/conv2d.py | 75 +++- python/tvm/topi/nn/conv2d.py | 10 +- .../search_policy/sketch_policy.cc | 65 ++-- src/auto_scheduler/search_task.cc | 16 + src/relay/backend/build_module.cc | 5 +- tutorials/auto_scheduler/tune_network_mali.py | 328 ++++++++++++++++++ 8 files changed, 512 insertions(+), 46 deletions(-) create mode 100644 tutorials/auto_scheduler/tune_network_mali.py diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 4c493d1d9366..2b26fc4931bd 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -259,9 +259,13 @@ def auto_schedule_topi(outs, has_complex_op): key = register_workload_tensors(dag.hash_key(), io_tensors) - # only enable layout rewrite for cpu backend + # only enable layout rewrite for cpu / mali backend target = tvm.target.Target.current() - enable_layout_rewrite = "cpu" in target.keys + enable_layout_rewrite_targets = ["cpu", "mali"] + enable_layout_rewrite = any( + enable_layout_rewrite_target in target.keys + for enable_layout_rewrite_target in enable_layout_rewrite_targets + ) env = TracingEnvironment.current if env is None: diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index f6ea911a15bf..c3db8980f89c 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -18,8 +18,10 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import import re from tvm import topi +from tvm.auto_scheduler import is_auto_scheduler_enabled from .generic import * from .. 
import op as _op +from .cuda import naive_schedule @conv2d_strategy.register("mali") @@ -69,6 +71,36 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): raise RuntimeError( "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout) ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + if not is_auto_scheduler_enabled(): + logger.error("conv2d NHWC layout is not enabled for mali with autotvm.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), + naive_schedule, + name="conv2d_nhwc.mali", + ) + is_winograd_applicable = False + if len(kernel.shape) == 4: + kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape) + is_winograd_applicable = ( + "float" in data.dtype + and "float" in kernel.dtype + and kernel_h == 3 + and kernel_w == 3 + and stride_h == 1 + and stride_w == 1 + and dilation_h == 1 + and dilation_w == 1 + ) + if is_winograd_applicable: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc.winograd", + plevel=15, + ) + else: raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -79,6 +111,15 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.mali", ) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + if not is_auto_scheduler_enabled(): + logger.error("depthwise_conv2d NHWC layout is not enabled for mali with autotvm.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + naive_schedule, + name="depthwise_conv2d_nhwc.mali", + ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) else: # group_conv2d @@ -105,6 +146,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali", ) + elif layout == "NHWC": + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc_winograd_without_weight_transform", + plevel=15, + ) + else: + logger.error("AutoTVM doesn't support NHWC winograd on Mali currently") else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index eb4005eb37c7..314b21febcd7 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" +import logging import tvm from tvm import te from tvm import relay @@ -25,10 +26,13 @@ from ..utils import traverse_inline, get_const_int, get_const_tuple from .. 
import nn from ..nn.winograd_util import winograd_transform_matrices +from ..nn.conv2d import conv2d_winograd_nhwc, _conv2d_winograd_nhwc_impl # reuse some compute declarations from ARM CPU from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw +logger = logging.getLogger("topi") + @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.mali") def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): @@ -188,8 +192,12 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): ##### WINOGRAD TEMPLATE ##### -def _pick_tile_size(data, kernel): - N, CI, H, W = get_const_tuple(data.shape) +def _pick_tile_size(data, kernel, layout="NCHW"): + if layout == "NCHW": + N, CI, H, W = get_const_tuple(data.shape) + else: + assert layout == "NHWC" + N, H, W, CI = get_const_tuple(data.shape) if H % 4 == 0: return 4 @@ -467,13 +475,47 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + impl, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) if workload is None: - # The best implementation is not an AutoTVM template, - # we then assume it's not necessary to alter this op. + # The best implementation is not an AutoTVM template. 
+ # It may be from the auto-scheduler + if impl.name.find("winograd") != -1: + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + + # Pre-compute weight transformation in winograd + tile_size = _pick_tile_size(tinfos[0], tinfos[1], layout="NHWC") + + # HWIO -> OIHW + kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1]) + # alpha, alpha, CO, CI + weight = relay.nn.contrib_conv2d_winograd_weight_transform( + kernel_transform, tile_size=tile_size + ) + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + return None cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: # if is fallback, clear query cache and return None @@ -481,16 +523,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return None topi_tmpl = workload[0] - new_attrs = {k: attrs[k] for k in attrs.keys()} - - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs["data_layout"] - kernel_layout = attrs["kernel_layout"] - data, kernel = tinfos - out_dtype = out_type.dtype - idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.mali": @@ -545,6 +577,19 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return None +@conv2d_winograd_nhwc.register(["mali"]) +def conv2d_winograd_nhwc_cuda( + data, weight, strides, padding, dilation, out_dtype, pre_computed=False +): + """Conv2D Winograd in NHWC layout. + This is a clean version to be used by the auto-scheduler for mali. + """ + tile_size = _pick_tile_size(data, weight, layout="NHWC") + return _conv2d_winograd_nhwc_impl( + data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed + ) + + ##### SCHECULE UTILITIES ##### def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): """ tile and bind to GPU threads """ diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8d591a20839a..8de696cf3fc2 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -382,7 +382,15 @@ def conv2d_nhwc( if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout # todo(merrymercy): wrap this with a more general interface. 
- if len(Filter.shape) >= 10: + if len(Filter.shape) == 17: + # For mali + kernel_h = Filter.shape[6] * Filter.shape[9] * Filter.shape[13] + kernel_w = Filter.shape[7] * Filter.shape[10] * Filter.shape[14] + channel = Filter.shape[8] * Filter.shape[11] * Filter.shape[15] + num_filter = Filter.shape[12] * Filter.shape[16] + for i in range(6): + num_filter *= Filter.shape[i] + elif len(Filter.shape) >= 10: # For cpu tile structure SSRSRS base = len(Filter.shape) - 10 kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base] diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index e81e824626d6..e2678373ef8b 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -115,21 +115,35 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->mutation_rules.push_back(std::make_shared(0.01)); } else if (IsGPUTask(node->search_task)) { // Sketch Generation Rules - node->sketch_rules.push_back(&rule_add_cache_read_stage); - node->sketch_rules.push_back(&rule_special_compute_location_gpu); - node->sketch_rules.push_back(&rule_always_inline); - node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); - node->sketch_rules.push_back(&rule_cross_thread_reduction); - node->sketch_rules.push_back(&rule_add_cache_write_stage); - node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); - node->sketch_rules.push_back(&rule_multi_level_tiling); - node->sketch_rules.push_back(&rule_skip_stage); + if (node->search_task->target->GetAttr("device", "") == "mali") { + node->sketch_rules.push_back(&rule_always_inline); + node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); + node->sketch_rules.push_back(&rule_add_rfactor); + node->sketch_rules.push_back(&rule_add_cache_write_stage); + node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); + node->sketch_rules.push_back(&rule_multi_level_tiling); + node->sketch_rules.push_back(&rule_skip_stage); + } else { + node->sketch_rules.push_back(&rule_add_cache_read_stage); + node->sketch_rules.push_back(&rule_special_compute_location_gpu); + node->sketch_rules.push_back(&rule_always_inline); + node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); + node->sketch_rules.push_back(&rule_cross_thread_reduction); + node->sketch_rules.push_back(&rule_add_cache_write_stage); + node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); + node->sketch_rules.push_back(&rule_multi_level_tiling); + node->sketch_rules.push_back(&rule_skip_stage); + } // Initial Population Generation Rules node->init_rules.push_back(&init_fill_tile_size); node->init_rules.push_back(&init_thread_bind); node->init_rules.push_back(&init_unroll); + if (node->search_task->target->GetAttr("device", "") == "mali") { + node->init_rules.push_back(&init_vectorization); + } + // Mutation Rules for Evolutionary Search node->mutation_rules.push_back(std::make_shared(0.90)); node->mutation_rules.push_back(std::make_shared(0.10)); @@ -389,23 +403,22 @@ Array SketchPolicyNode::SampleInitPopulation(const Array& sketches std::vector temp_states(population); // Sample a batch of states randomly - support::parallel_for(0, population, - [this, &temp_states, &sketches, &rand_gens](int index) { - // Randomly choose a sketch - State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; - // Apply random annotation rules one by one - bool valid = true; - for (const auto& rule : init_rules) { - 
if (rule->Apply(this, &tmp_s, &rand_gens[index]) == - PopulationGenerationRule::ResultKind::kInvalid) { - valid = false; - break; - } - } - if (valid) { - temp_states[index] = std::move(tmp_s); - } - }); + support::parallel_for(0, population, [this, &temp_states, &sketches, &rand_gens](int index) { + // Randomly choose a sketch + State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; + // Apply random annotation rules one by one + bool valid = true; + for (const auto& rule : init_rules) { + if (rule->Apply(this, &tmp_s, &rand_gens[index]) == + PopulationGenerationRule::ResultKind::kInvalid) { + valid = false; + break; + } + } + if (valid) { + temp_states[index] = std::move(tmp_s); + } + }); // Filter out the states that were failed to apply initial rules Array cand_states; diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 5a3475542878..f6f129795447 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -90,6 +90,22 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target int max_vthread_extent = warp_size / 4; return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, max_threads_per_block, max_vthread_extent, warp_size); + } else if (target->kind->device_type == kDLOpenCL) { + if (target->GetAttr("device", "") == "mali") { + // We can not use device api to get attr like CUDA + // because like Mali target is normally on the remote machine + int max_shared_memory_per_block = 32768; + int max_local_memory_per_block = INT32_MAX; // skip the check on local memory + int max_threads_per_block = 256; + int warp_size = 1; + int max_vthread_extent = 1; + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); + } else { + // add other opencl target + auto target_device = target->GetAttr("device", ""); + LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; + } } else { LOG(FATAL) << "No default hardware parameters for target: " << target; } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index a0828d1cac6c..d10a7c307d9a 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -343,8 +343,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) { const auto& target = (*targets.begin()).second; Pass major_pass = transform::AutoSchedulerLayoutRewrite(); - - if (target->kind->device_type == kDLCPU && pass_ctx.PassEnabled(major_pass->Info())) { + bool enable_layout_rewrite_targets = + target->kind->device_type == kDLCPU || target->GetAttr("device", "") == "mali"; + if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) { With tctx(target); relay_module = major_pass(relay_module); // Defuse ops to fold constants, then fuse them again diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py new file mode 100644 index 000000000000..9ec3fc3969fd --- /dev/null +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -0,0 +1,328 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for mali GPU +============================================ +**Author**: `Zhao Wu `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for mali GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_runtime +import os + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
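######################################################################
# .. note:: Converting a model to NHWC
#
#   A minimal sketch of doing the layout conversion with the ConvertLayout
#   pass; the op list below is only an illustration and should be adapted
#   to the ops in your model:
#
#   .. code-block:: python
#
#     # Convert conv2d to NHWC data layout; "default" keeps the preferred
#     # kernel layout for the chosen data layout.
#     desired_layouts = {"nn.conv2d": ["NHWC", "default"]}
#     seq = tvm.transform.Sequential(
#         [
#             relay.transform.RemoveUnusedFunctions(),
#             relay.transform.ConvertLayout(desired_layouts),
#         ]
#     )
#     with tvm.transform.PassContext(opt_level=3):
#         mod = seq(mod)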
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet50_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target. +network = "mobilenet" +batch_size = 1 +layout = "NHWC" +# replace this with the device key in your tracker +device_key = "rk3399" +# Set this to True if you use ndk tools for cross compiling +use_ndk = True +# Path to cross compiler +os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++" +target_host = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu") +target = tvm.target.Target("opencl -device=mali") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
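######################################################################
# .. note:: What the weighted objective looks like
#
#   A minimal sketch of the objective above, assuming ``task_latencies`` is a
#   hypothetical list holding one measured latency (in seconds) per extracted
#   task, in the same order as ``task_weights``:
#
#   .. code-block:: python
#
#     estimated_total_latency = sum(
#         latency * weight for latency, weight in zip(task_latencies, task_weights)
#     )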
+ +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +from tvm.auto_scheduler.utils import ( + call_func_with_timeout, + check_remote, + get_const_tuple, + make_traceback_info, + request_remote, +) + +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Tuning and Evaluate +# ------------ +# Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance +# +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`800 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 29 tasks in resnet-50, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRunner` for more parameters. +# + + +def tune_and_evaluate(): + print("Begin tuning...") + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=200, # change this to 20000 to achieve the best performance + builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"), + runner=auto_scheduler.RPCRunner( + device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50 + ), + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + # Compile the whole network + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target, target_host=target_host, params=params) + + # Create graph runtime + print("=============== Request Remote ===============") + remote = request_remote(device_key, "0.0.0.0", 9190) + ctx = remote.cl() + from tvm.contrib import utils, ndk + + temp = utils.tempdir() + filename = "deploy_lib.so" + path_lib = temp.relpath(filename) + lib.export_library(path_lib, ndk.create_shared) + remote.upload(path_lib) + loaded_lib = remote.load_module(filename) + module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + data = (np.random.uniform(size=input_shape)).astype(dtype) + data_tvm = tvm.nd.array(data) + module.set_input("data", data_tvm) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print( + "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) + ) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +# tune_and_evaluate() + +###################################################################### +# .. 
note:: Explain the printed information during tuning
#
#   During the tuning, a lot of information will be printed on the console.
#   They are used for debugging purposes. The most important info is the output
#   of the task scheduler. The following table is a sample output.
#
#   .. code-block:: c
#
#     ----------------------------------------------------------------------
#     ------------------------------  [ Task Scheduler ]
#     ----------------------------------------------------------------------
#     |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
#     -------------------------------------------------
#     |    0 |        0.010 |           0.40 |     64 |
#     |    1 |        0.087 |          47.19 |     64 |
#     |    2 |        0.008 |          -0.00 |     64 |
#     |    3 |        0.177 |         582.07 |     64 |
#     |    4 |        0.268 |         862.37 |    256 |
#     |    5 |        0.166 |         621.13 |    128 |
#     |    6 |        0.170 |         605.10 |    128 |
#     |    7 |        0.128 |         403.20 |     64 |
#     |    8 |        0.189 |         545.71 |     64 |
#     |    9 |        0.231 |        1001.01 |    448 |
#     |   10 |        0.155 |         664.80 |    256 |
#     |   11 |        0.155 |         662.86 |    256 |
#     |   12 |        0.119 |         434.08 |     64 |
#     |   13 |        0.199 |         522.13 |     64 |
#     |   14 |        0.235 |         986.56 |    320 |
#     |   15 |        0.149 |         689.13 |    128 |
#     |   16 |        0.155 |         664.80 |    192 |
#     |   17 |        0.151 |         340.64 |     64 |
#     |   18 |        0.176 |         597.55 |    128 |
#     |   19 |        0.220 |        1054.37 |    192 |
#     |   20 |        0.150 |         686.01 |    128 |
#     |   21 |        0.159 |         650.88 |    128 |
#     |   22 |        0.073 |         358.19 |     64 |
#     |   23 |        0.031 |          70.63 |     64 |
#     |   24 |        0.251 |         947.73 |    128 |
#     |   25 |        0.157 |         652.47 |    128 |
#     |   26 |        0.215 |         954.84 |    128 |
#     |   27 |        0.237 |         868.92 |    128 |
#     |   28 |        0.266 |         774.06 |    128 |
#     -------------------------------------------------
#     Estimated total latency: 10.016 ms  Trials: 3992  Used time : 1131 s  Next ID: 15
#
#   This table lists the latency and (estimated) speed of all tasks.
#   It also lists the allocation of measurement trials for all tasks.
#   The last line prints the total weighted latency of these tasks,
#   which can be a rough estimation of the end-to-end execution time
#   of the network.
#   The last line also prints the total number of measurement trials,
#   total time spent on auto-tuning and the id of the next task to tune.
#
#   There will also be some "dmlc::Error"s, because the
#   auto-scheduler will try some invalid schedules.
#   You can safely ignore them if the tuning can continue, because these
#   errors are isolated from the main process.
#

######################################################################
# .. note:: Terminate the tuning earlier
#
#   You can terminate the tuning earlier by forcibly killing this process.
#   As long as you get at least one valid schedule for each task in the log file,
#   you should be able to do the compilation (the section below).
#

#################################################################
# Other Tips
# ----------
# 1. During the tuning, the auto-scheduler needs to compile many programs and
#    extract feature from them. This part is CPU-intensive,
#    so a high-performance CPU with many cores is recommended for faster search.
# 2. If you have multiple target CPUs, you can use all of them for measurements to
#    parallelize the measurements. Check this :ref:`section `
#    to learn how to use the RPC Tracker and RPC Server.
#    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
#    with :any:`auto_scheduler.RPCRunner`.
+# From 6c156c6a4770fb4e0dd1aa932e58a635028d917d Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Fri, 18 Dec 2020 21:41:47 +0800 Subject: [PATCH 2/5] Fix doc --- tutorials/auto_scheduler/tune_network_mali.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 9ec3fc3969fd..a9449d7ca498 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -16,7 +16,7 @@ # under the License. """ Auto-scheduling a Neural Network for mali GPU -============================================ +============================================= **Author**: `Zhao Wu `_ Auto-tuning for specific devices and workloads is critical for getting the @@ -160,14 +160,6 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # Extract tasks from the network print("Extract tasks...") mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -from tvm.auto_scheduler.utils import ( - call_func_with_timeout, - check_remote, - get_const_tuple, - make_traceback_info, - request_remote, -) - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) for idx, task in enumerate(tasks): @@ -176,7 +168,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): ################################################################# # Tuning and Evaluate -# ------------ +# ------------------- # Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance # # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. @@ -217,10 +209,10 @@ def tune_and_evaluate(): # Create graph runtime print("=============== Request Remote ===============") + from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk - temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) @@ -241,7 +233,7 @@ def tune_and_evaluate(): ) -# We do not run the tuning in our webpage server since it takes too long. +# We do not run the tuning in our webpage server since server doesn't have mali gpu. # Uncomment the following line to run it by yourself. # tune_and_evaluate() @@ -320,7 +312,7 @@ def tune_and_evaluate(): # 1. During the tuning, the auto-scheduler needs to compile many programs and # extract feature from them. This part is CPU-intensive, # so a high-performance CPU with many cores is recommended for faster search. -# 2. If you have multiple target CPUs, you can use all of them for measurements to +# 2. If you have multiple target devices, you can use all of them for measurements to # parallelize the measurements. Check this :ref:`section ` # to learn how to use the RPC Tracker and RPC Server. 
# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` From 819f1775c5f34e4bf16e5ce3ed819ad806e5cecc Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Fri, 18 Dec 2020 22:21:26 +0800 Subject: [PATCH 3/5] fix lint --- tutorials/auto_scheduler/tune_network_mali.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index a9449d7ca498..618f3c921a6e 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -210,9 +210,11 @@ def tune_and_evaluate(): # Create graph runtime print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote + remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk + temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) From 38a3668dcc7c94f15e503cc99f2227898b5e2586 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 21 Dec 2020 20:08:24 +0800 Subject: [PATCH 4/5] address comments --- python/tvm/relay/op/strategy/mali.py | 26 +++++++++++-------- python/tvm/topi/mali/conv2d.py | 2 +- python/tvm/topi/nn/conv2d.py | 5 +++- src/auto_scheduler/search_task.cc | 4 +-- tutorials/auto_scheduler/tune_network_mali.py | 26 ++++++++++++++++++- 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index c3db8980f89c..c4cb4a135e8e 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -21,7 +21,6 @@ from tvm.auto_scheduler import is_auto_scheduler_enabled from .generic import * from .. import op as _op -from .cuda import naive_schedule @conv2d_strategy.register("mali") @@ -74,7 +73,9 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): - logger.error("conv2d NHWC layout is not enabled for mali with autotvm.") + raise RuntimeError( + "conv2d NHWC layout is not enabled for mali without auto_scheduler." + ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), naive_schedule, @@ -114,7 +115,9 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWOI" if not is_auto_scheduler_enabled(): - logger.error("depthwise_conv2d NHWC layout is not enabled for mali with autotvm.") + raise RuntimeError( + "depthwise_conv2d NHWC layout is not enabled for mali without auto_scheduler." + ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), naive_schedule, @@ -147,15 +150,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty name="conv2d_nchw_winograd.mali", ) elif layout == "NHWC": - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), - naive_schedule, # this implementation should never be picked by autotvm - name="conv2d_nhwc_winograd_without_weight_transform", - plevel=15, + if not is_auto_scheduler_enabled(): + raise RuntimeError( + "Winograd conv2d NHWC is not enabled for mali without auto_scheduler." 
) - else: - logger.error("AutoTVM doesn't support NHWC winograd on Mali currently") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc_winograd_without_weight_transform", + plevel=15, + ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index 314b21febcd7..6da3c5f6ec68 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -578,7 +578,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): @conv2d_winograd_nhwc.register(["mali"]) -def conv2d_winograd_nhwc_cuda( +def conv2d_winograd_nhwc_mali( data, weight, strides, padding, dilation, out_dtype, pre_computed=False ): """Conv2D Winograd in NHWC layout. diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8de696cf3fc2..ead9f16a256f 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -383,7 +383,10 @@ def conv2d_nhwc( # Infer shape for the rewritten layout # todo(merrymercy): wrap this with a more general interface. if len(Filter.shape) == 17: - # For mali + # For mali. + # GPU tile structure is SSSRRSRS + # You could refer function comment of DoMultiLevelTiling + # in the utils.h to see more detail explanation. kernel_h = Filter.shape[6] * Filter.shape[9] * Filter.shape[13] kernel_w = Filter.shape[7] * Filter.shape[10] * Filter.shape[14] channel = Filter.shape[8] * Filter.shape[11] * Filter.shape[15] diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index f6f129795447..93f34609cbbc 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -92,8 +92,8 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target max_threads_per_block, max_vthread_extent, warp_size); } else if (target->kind->device_type == kDLOpenCL) { if (target->GetAttr("device", "") == "mali") { - // We can not use device api to get attr like CUDA - // because like Mali target is normally on the remote machine + // We cannot use device API to get hardware attributes like CUDA, + // because like Mali target is normally on the remote machine. int max_shared_memory_per_block = 32768; int max_local_memory_per_block = INT32_MAX; // skip the check on local memory int max_threads_per_block = 256; diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 618f3c921a6e..c4f38a71ed9f 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -135,7 +135,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): network = "mobilenet" batch_size = 1 layout = "NHWC" -# replace this with the device key in your tracker +# Replace this with the device key in your tracker device_key = "rk3399" # Set this to True if you use ndk tools for cross compiling use_ndk = True @@ -165,6 +165,30 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) +###################################################################### +# .. note:: How to get the hardware parameters from remote device +# +# .. 
code-block:: python
+#
+# from tvm.auto_scheduler.utils import request_remote
+# remote = request_remote(device_key, "0.0.0.0", 9190)
+# ctx = remote.cl()
+# max_shared_memory_per_block = ctx.max_shared_memory_per_block
+# # There is no explicit local memory limitation,
+# # so we can use INT32_MAX to disable the check on local_memory.
+# max_local_memory_per_block = 2147483647 # INT32_MAX
+# max_threads_per_block = ctx.max_threads_per_block
+# max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size
+# warp_size = ctx.warp_size
+# hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
+# max_shared_memory_per_block, max_local_memory_per_block,
+# max_threads_per_block, max_vthread_extent, warp_size)
+# Now you could pass it to search task and tune
+#
+# .. code-block:: python
+#
+# tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host, hardware_params)
+#
 #################################################################
 # Tuning and Evaluate

From c0d7e31c855e93efc13a6fa0d240825ae806b902 Mon Sep 17 00:00:00 2001
From: Zhao Wu
Date: Mon, 21 Dec 2020 23:16:13 +0800
Subject: [PATCH 5/5] fix doc

---
 tutorials/auto_scheduler/tune_network_mali.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index c4f38a71ed9f..6127963c0c47 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -183,6 +183,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 # hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
 # max_shared_memory_per_block, max_local_memory_per_block,
 # max_threads_per_block, max_vthread_extent, warp_size)
+#
 # Now you could pass it to search task and tune
 #
 # .. code-block:: python