From c511f209095b82eca463e9a7863dffff93068332 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 7 Dec 2020 18:18:16 +0800 Subject: [PATCH 1/5] [Auto Scheduler] Mali Support --- .../tvm/auto_scheduler/relay_integration.py | 8 +- python/tvm/relay/op/strategy/mali.py | 51 +++ python/tvm/topi/mali/conv2d.py | 75 +++- python/tvm/topi/nn/conv2d.py | 10 +- .../search_policy/sketch_policy.cc | 65 ++-- src/auto_scheduler/search_task.cc | 16 + src/relay/backend/build_module.cc | 5 +- tutorials/auto_scheduler/tune_network_mali.py | 328 ++++++++++++++++++ 8 files changed, 512 insertions(+), 46 deletions(-) create mode 100644 tutorials/auto_scheduler/tune_network_mali.py diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 4c493d1d9366..2b26fc4931bd 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -259,9 +259,13 @@ def auto_schedule_topi(outs, has_complex_op): key = register_workload_tensors(dag.hash_key(), io_tensors) - # only enable layout rewrite for cpu backend + # only enable layout rewrite for cpu / mali backend target = tvm.target.Target.current() - enable_layout_rewrite = "cpu" in target.keys + enable_layout_rewrite_targets = ["cpu", "mali"] + enable_layout_rewrite = any( + enable_layout_rewrite_target in target.keys + for enable_layout_rewrite_target in enable_layout_rewrite_targets + ) env = TracingEnvironment.current if env is None: diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index f6ea911a15bf..c3db8980f89c 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -18,8 +18,10 @@ # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import import re from tvm import topi +from tvm.auto_scheduler import is_auto_scheduler_enabled from .generic import * from .. 
import op as _op +from .cuda import naive_schedule @conv2d_strategy.register("mali") @@ -69,6 +71,36 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): raise RuntimeError( "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout) ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + if not is_auto_scheduler_enabled(): + logger.error("conv2d NHWC layout is not enabled for mali with autotvm.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), + naive_schedule, + name="conv2d_nhwc.mali", + ) + is_winograd_applicable = False + if len(kernel.shape) == 4: + kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape) + is_winograd_applicable = ( + "float" in data.dtype + and "float" in kernel.dtype + and kernel_h == 3 + and kernel_w == 3 + and stride_h == 1 + and stride_w == 1 + and dilation_h == 1 + and dilation_w == 1 + ) + if is_winograd_applicable: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc.winograd", + plevel=15, + ) + else: raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -79,6 +111,15 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.mali", ) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + if not is_auto_scheduler_enabled(): + logger.error("depthwise_conv2d NHWC layout is not enabled for mali with autotvm.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + naive_schedule, + name="depthwise_conv2d_nhwc.mali", + ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) else: # group_conv2d @@ -105,6 +146,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali", ) + elif layout == "NHWC": + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc_winograd_without_weight_transform", + plevel=15, + ) + else: + logger.error("AutoTVM doesn't support NHWC winograd on Mali currently") else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index eb4005eb37c7..314b21febcd7 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" +import logging import tvm from tvm import te from tvm import relay @@ -25,10 +26,13 @@ from ..utils import traverse_inline, get_const_int, get_const_tuple from .. 
import nn from ..nn.winograd_util import winograd_transform_matrices +from ..nn.conv2d import conv2d_winograd_nhwc, _conv2d_winograd_nhwc_impl # reuse some compute declarations from ARM CPU from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw +logger = logging.getLogger("topi") + @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.mali") def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): @@ -188,8 +192,12 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): ##### WINOGRAD TEMPLATE ##### -def _pick_tile_size(data, kernel): - N, CI, H, W = get_const_tuple(data.shape) +def _pick_tile_size(data, kernel, layout="NCHW"): + if layout == "NCHW": + N, CI, H, W = get_const_tuple(data.shape) + else: + assert layout == "NHWC" + N, H, W, CI = get_const_tuple(data.shape) if H % 4 == 0: return 4 @@ -467,13 +475,47 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implementation( + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + impl, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target ) workload = autotvm.task.get_workload(outs) if workload is None: - # The best implementation is not an AutoTVM template, - # we then assume it's not necessary to alter this op. + # The best implementation is not an AutoTVM template. 
+ # It may be from the auto-scheduler + if impl.name.find("winograd") != -1: + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + + # Pre-compute weight transformation in winograd + tile_size = _pick_tile_size(tinfos[0], tinfos[1], layout="NHWC") + + # HWIO -> OIHW + kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 1]) + # alpha, alpha, CO, CI + weight = relay.nn.contrib_conv2d_winograd_weight_transform( + kernel_transform, tile_size=tile_size + ) + new_attrs["tile_size"] = tile_size + new_attrs["channels"] = CO + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs + ) + return None cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: # if is fallback, clear query cache and return None @@ -481,16 +523,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return None topi_tmpl = workload[0] - new_attrs = {k: attrs[k] for k in attrs.keys()} - - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs["data_layout"] - kernel_layout = attrs["kernel_layout"] - data, kernel = tinfos - out_dtype = out_type.dtype - idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.mali": @@ -545,6 +577,19 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return None +@conv2d_winograd_nhwc.register(["mali"]) +def conv2d_winograd_nhwc_cuda( + data, weight, strides, padding, dilation, out_dtype, pre_computed=False +): + """Conv2D Winograd in NHWC layout. + This is a clean version to be used by the auto-scheduler for mali. + """ + tile_size = _pick_tile_size(data, weight, layout="NHWC") + return _conv2d_winograd_nhwc_impl( + data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed + ) + + ##### SCHECULE UTILITIES ##### def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): """ tile and bind to GPU threads """ diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8d591a20839a..8de696cf3fc2 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -382,7 +382,15 @@ def conv2d_nhwc( if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout # todo(merrymercy): wrap this with a more general interface. 
- if len(Filter.shape) >= 10: + if len(Filter.shape) == 17: + # For mali + kernel_h = Filter.shape[6] * Filter.shape[9] * Filter.shape[13] + kernel_w = Filter.shape[7] * Filter.shape[10] * Filter.shape[14] + channel = Filter.shape[8] * Filter.shape[11] * Filter.shape[15] + num_filter = Filter.shape[12] * Filter.shape[16] + for i in range(6): + num_filter *= Filter.shape[i] + elif len(Filter.shape) >= 10: # For cpu tile structure SSRSRS base = len(Filter.shape) - 10 kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base] diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index e81e824626d6..e2678373ef8b 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -115,21 +115,35 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->mutation_rules.push_back(std::make_shared(0.01)); } else if (IsGPUTask(node->search_task)) { // Sketch Generation Rules - node->sketch_rules.push_back(&rule_add_cache_read_stage); - node->sketch_rules.push_back(&rule_special_compute_location_gpu); - node->sketch_rules.push_back(&rule_always_inline); - node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); - node->sketch_rules.push_back(&rule_cross_thread_reduction); - node->sketch_rules.push_back(&rule_add_cache_write_stage); - node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); - node->sketch_rules.push_back(&rule_multi_level_tiling); - node->sketch_rules.push_back(&rule_skip_stage); + if (node->search_task->target->GetAttr("device", "") == "mali") { + node->sketch_rules.push_back(&rule_always_inline); + node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); + node->sketch_rules.push_back(&rule_add_rfactor); + node->sketch_rules.push_back(&rule_add_cache_write_stage); + node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); + node->sketch_rules.push_back(&rule_multi_level_tiling); + node->sketch_rules.push_back(&rule_skip_stage); + } else { + node->sketch_rules.push_back(&rule_add_cache_read_stage); + node->sketch_rules.push_back(&rule_special_compute_location_gpu); + node->sketch_rules.push_back(&rule_always_inline); + node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); + node->sketch_rules.push_back(&rule_cross_thread_reduction); + node->sketch_rules.push_back(&rule_add_cache_write_stage); + node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); + node->sketch_rules.push_back(&rule_multi_level_tiling); + node->sketch_rules.push_back(&rule_skip_stage); + } // Initial Population Generation Rules node->init_rules.push_back(&init_fill_tile_size); node->init_rules.push_back(&init_thread_bind); node->init_rules.push_back(&init_unroll); + if (node->search_task->target->GetAttr("device", "") == "mali") { + node->init_rules.push_back(&init_vectorization); + } + // Mutation Rules for Evolutionary Search node->mutation_rules.push_back(std::make_shared(0.90)); node->mutation_rules.push_back(std::make_shared(0.10)); @@ -389,23 +403,22 @@ Array SketchPolicyNode::SampleInitPopulation(const Array& sketches std::vector temp_states(population); // Sample a batch of states randomly - support::parallel_for(0, population, - [this, &temp_states, &sketches, &rand_gens](int index) { - // Randomly choose a sketch - State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; - // Apply random annotation rules one by one - bool valid = true; - for (const auto& rule : init_rules) { - 
if (rule->Apply(this, &tmp_s, &rand_gens[index]) == - PopulationGenerationRule::ResultKind::kInvalid) { - valid = false; - break; - } - } - if (valid) { - temp_states[index] = std::move(tmp_s); - } - }); + support::parallel_for(0, population, [this, &temp_states, &sketches, &rand_gens](int index) { + // Randomly choose a sketch + State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; + // Apply random annotation rules one by one + bool valid = true; + for (const auto& rule : init_rules) { + if (rule->Apply(this, &tmp_s, &rand_gens[index]) == + PopulationGenerationRule::ResultKind::kInvalid) { + valid = false; + break; + } + } + if (valid) { + temp_states[index] = std::move(tmp_s); + } + }); // Filter out the states that were failed to apply initial rules Array cand_states; diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 5a3475542878..f6f129795447 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -90,6 +90,22 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target int max_vthread_extent = warp_size / 4; return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, max_threads_per_block, max_vthread_extent, warp_size); + } else if (target->kind->device_type == kDLOpenCL) { + if (target->GetAttr("device", "") == "mali") { + // We can not use device api to get attr like CUDA + // because like Mali target is normally on the remote machine + int max_shared_memory_per_block = 32768; + int max_local_memory_per_block = INT32_MAX; // skip the check on local memory + int max_threads_per_block = 256; + int warp_size = 1; + int max_vthread_extent = 1; + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); + } else { + // add other opencl target + auto target_device = target->GetAttr("device", ""); + LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; + } } else { LOG(FATAL) << "No default hardware parameters for target: " << target; } diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index a0828d1cac6c..d10a7c307d9a 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -343,8 +343,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) { const auto& target = (*targets.begin()).second; Pass major_pass = transform::AutoSchedulerLayoutRewrite(); - - if (target->kind->device_type == kDLCPU && pass_ctx.PassEnabled(major_pass->Info())) { + bool enable_layout_rewrite_targets = + target->kind->device_type == kDLCPU || target->GetAttr("device", "") == "mali"; + if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) { With tctx(target); relay_module = major_pass(relay_module); // Defuse ops to fold constants, then fuse them again diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py new file mode 100644 index 000000000000..9ec3fc3969fd --- /dev/null +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -0,0 +1,328 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for mali GPU +============================================ +**Author**: `Zhao Wu `_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for mali GPU with the auto-scheduler. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block. +""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_runtime +import os + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. 
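######################################################################
# .. note:: Converting a model to NHWC
#
#   A minimal sketch of doing the layout conversion with the ConvertLayout
#   pass; the op list below is only an illustration and should be adapted
#   to the ops in your model:
#
#   .. code-block:: python
#
#     # Convert conv2d to NHWC data layout; "default" keeps the preferred
#     # kernel layout for the chosen data layout.
#     desired_layouts = {"nn.conv2d": ["NHWC", "default"]}
#     seq = tvm.transform.Sequential(
#         [
#             relay.transform.RemoveUnusedFunctions(),
#             relay.transform.ConvertLayout(desired_layouts),
#         ]
#     )
#     with tvm.transform.PassContext(opt_level=3):
#         mod = seq(mod)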
+ + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet50_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +# Define the neural network and compilation target. +network = "mobilenet" +batch_size = 1 +layout = "NHWC" +# replace this with the device key in your tracker +device_key = "rk3399" +# Set this to True if you use ndk tools for cross compiling +use_ndk = True +# Path to cross compiler +os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++" +target_host = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu") +target = tvm.target.Target("opencl -device=mali") +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. +# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. 
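######################################################################
# .. note:: What the weighted objective looks like
#
#   A minimal sketch of the objective above, assuming ``task_latencies`` is a
#   hypothetical list holding one measured latency (in seconds) per extracted
#   task, in the same order as ``task_weights``:
#
#   .. code-block:: python
#
#     estimated_total_latency = sum(
#         latency * weight for latency, weight in zip(task_latencies, task_weights)
#     )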
+ +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +from tvm.auto_scheduler.utils import ( + call_func_with_timeout, + check_remote, + get_const_tuple, + make_traceback_info, + request_remote, +) + +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + +################################################################# +# Tuning and Evaluate +# ------------ +# Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance +# +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`800 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 29 tasks in resnet-50, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRunner` for more parameters. +# + + +def tune_and_evaluate(): + print("Begin tuning...") + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=200, # change this to 20000 to achieve the best performance + builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"), + runner=auto_scheduler.RPCRunner( + device_key, host="0.0.0.0", port=9190, repeat=3, timeout=50 + ), + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + # Compile the whole network + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target, target_host=target_host, params=params) + + # Create graph runtime + print("=============== Request Remote ===============") + remote = request_remote(device_key, "0.0.0.0", 9190) + ctx = remote.cl() + from tvm.contrib import utils, ndk + + temp = utils.tempdir() + filename = "deploy_lib.so" + path_lib = temp.relpath(filename) + lib.export_library(path_lib, ndk.create_shared) + remote.upload(path_lib) + loaded_lib = remote.load_module(filename) + module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + data = (np.random.uniform(size=input_shape)).astype(dtype) + data_tvm = tvm.nd.array(data) + module.set_input("data", data_tvm) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print( + "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) + ) + + +# We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. + +# tune_and_evaluate() + +###################################################################### +# .. 
note:: Explain the printed information during tuning
#
#   During the tuning, a lot of information will be printed on the console.
#   They are used for debugging purposes. The most important info is the output
#   of the task scheduler. The following table is a sample output.
#
#   .. code-block:: c
#
#     ----------------------------------------------------------------------
#     ------------------------------  [ Task Scheduler ]
#     ----------------------------------------------------------------------
#     |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
#     -------------------------------------------------
#     |    0 |        0.010 |           0.40 |     64 |
#     |    1 |        0.087 |          47.19 |     64 |
#     |    2 |        0.008 |          -0.00 |     64 |
#     |    3 |        0.177 |         582.07 |     64 |
#     |    4 |        0.268 |         862.37 |    256 |
#     |    5 |        0.166 |         621.13 |    128 |
#     |    6 |        0.170 |         605.10 |    128 |
#     |    7 |        0.128 |         403.20 |     64 |
#     |    8 |        0.189 |         545.71 |     64 |
#     |    9 |        0.231 |        1001.01 |    448 |
#     |   10 |        0.155 |         664.80 |    256 |
#     |   11 |        0.155 |         662.86 |    256 |
#     |   12 |        0.119 |         434.08 |     64 |
#     |   13 |        0.199 |         522.13 |     64 |
#     |   14 |        0.235 |         986.56 |    320 |
#     |   15 |        0.149 |         689.13 |    128 |
#     |   16 |        0.155 |         664.80 |    192 |
#     |   17 |        0.151 |         340.64 |     64 |
#     |   18 |        0.176 |         597.55 |    128 |
#     |   19 |        0.220 |        1054.37 |    192 |
#     |   20 |        0.150 |         686.01 |    128 |
#     |   21 |        0.159 |         650.88 |    128 |
#     |   22 |        0.073 |         358.19 |     64 |
#     |   23 |        0.031 |          70.63 |     64 |
#     |   24 |        0.251 |         947.73 |    128 |
#     |   25 |        0.157 |         652.47 |    128 |
#     |   26 |        0.215 |         954.84 |    128 |
#     |   27 |        0.237 |         868.92 |    128 |
#     |   28 |        0.266 |         774.06 |    128 |
#     -------------------------------------------------
#     Estimated total latency: 10.016 ms  Trials: 3992  Used time : 1131 s  Next ID: 15
#
#   This table lists the latency and (estimated) speed of all tasks.
#   It also lists the allocation of measurement trials for all tasks.
#   The last line prints the total weighted latency of these tasks,
#   which can be a rough estimation of the end-to-end execution time
#   of the network.
#   The last line also prints the total number of measurement trials,
#   total time spent on auto-tuning and the id of the next task to tune.
#
#   There will also be some "dmlc::Error"s, because the
#   auto-scheduler will try some invalid schedules.
#   You can safely ignore them if the tuning can continue, because these
#   errors are isolated from the main process.
#

######################################################################
# .. note:: Terminate the tuning earlier
#
#   You can terminate the tuning earlier by forcibly killing this process.
#   As long as you get at least one valid schedule for each task in the log file,
#   you should be able to do the compilation (the section below).
#

#################################################################
# Other Tips
# ----------
# 1. During the tuning, the auto-scheduler needs to compile many programs and
#    extract feature from them. This part is CPU-intensive,
#    so a high-performance CPU with many cores is recommended for faster search.
# 2. If you have multiple target CPUs, you can use all of them for measurements to
#    parallelize the measurements. Check this :ref:`section `
#    to learn how to use the RPC Tracker and RPC Server.
#    To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions`
#    with :any:`auto_scheduler.RPCRunner`.
+# From 6c156c6a4770fb4e0dd1aa932e58a635028d917d Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Fri, 18 Dec 2020 21:41:47 +0800 Subject: [PATCH 2/5] Fix doc --- tutorials/auto_scheduler/tune_network_mali.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 9ec3fc3969fd..a9449d7ca498 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -16,7 +16,7 @@ # under the License. """ Auto-scheduling a Neural Network for mali GPU -============================================ +============================================= **Author**: `Zhao Wu `_ Auto-tuning for specific devices and workloads is critical for getting the @@ -160,14 +160,6 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): # Extract tasks from the network print("Extract tasks...") mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -from tvm.auto_scheduler.utils import ( - call_func_with_timeout, - check_remote, - get_const_tuple, - make_traceback_info, - request_remote, -) - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host) for idx, task in enumerate(tasks): @@ -176,7 +168,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): ################################################################# # Tuning and Evaluate -# ------------ +# ------------------- # Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance # # * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. @@ -217,10 +209,10 @@ def tune_and_evaluate(): # Create graph runtime print("=============== Request Remote ===============") + from tvm.auto_scheduler.utils import request_remote remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk - temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) @@ -241,7 +233,7 @@ def tune_and_evaluate(): ) -# We do not run the tuning in our webpage server since it takes too long. +# We do not run the tuning in our webpage server since server doesn't have mali gpu. # Uncomment the following line to run it by yourself. # tune_and_evaluate() @@ -320,7 +312,7 @@ def tune_and_evaluate(): # 1. During the tuning, the auto-scheduler needs to compile many programs and # extract feature from them. This part is CPU-intensive, # so a high-performance CPU with many cores is recommended for faster search. -# 2. If you have multiple target CPUs, you can use all of them for measurements to +# 2. If you have multiple target devices, you can use all of them for measurements to # parallelize the measurements. Check this :ref:`section ` # to learn how to use the RPC Tracker and RPC Server. 
# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` From 819f1775c5f34e4bf16e5ce3ed819ad806e5cecc Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Fri, 18 Dec 2020 22:21:26 +0800 Subject: [PATCH 3/5] fix lint --- tutorials/auto_scheduler/tune_network_mali.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index a9449d7ca498..618f3c921a6e 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -210,9 +210,11 @@ def tune_and_evaluate(): # Create graph runtime print("=============== Request Remote ===============") from tvm.auto_scheduler.utils import request_remote + remote = request_remote(device_key, "0.0.0.0", 9190) ctx = remote.cl() from tvm.contrib import utils, ndk + temp = utils.tempdir() filename = "deploy_lib.so" path_lib = temp.relpath(filename) From 38a3668dcc7c94f15e503cc99f2227898b5e2586 Mon Sep 17 00:00:00 2001 From: Zhao Wu Date: Mon, 21 Dec 2020 20:08:24 +0800 Subject: [PATCH 4/5] address comments --- python/tvm/relay/op/strategy/mali.py | 26 +++++++++++-------- python/tvm/topi/mali/conv2d.py | 2 +- python/tvm/topi/nn/conv2d.py | 5 +++- src/auto_scheduler/search_task.cc | 4 +-- tutorials/auto_scheduler/tune_network_mali.py | 26 ++++++++++++++++++- 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index c3db8980f89c..c4cb4a135e8e 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -21,7 +21,6 @@ from tvm.auto_scheduler import is_auto_scheduler_enabled from .generic import * from .. import op as _op -from .cuda import naive_schedule @conv2d_strategy.register("mali") @@ -74,7 +73,9 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): - logger.error("conv2d NHWC layout is not enabled for mali with autotvm.") + raise RuntimeError( + "conv2d NHWC layout is not enabled for mali without auto_scheduler." + ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), naive_schedule, @@ -114,7 +115,9 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWOI" if not is_auto_scheduler_enabled(): - logger.error("depthwise_conv2d NHWC layout is not enabled for mali with autotvm.") + raise RuntimeError( + "depthwise_conv2d NHWC layout is not enabled for mali without auto_scheduler." + ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), naive_schedule, @@ -147,15 +150,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty name="conv2d_nchw_winograd.mali", ) elif layout == "NHWC": - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), - naive_schedule, # this implementation should never be picked by autotvm - name="conv2d_nhwc_winograd_without_weight_transform", - plevel=15, + if not is_auto_scheduler_enabled(): + raise RuntimeError( + "Winograd conv2d NHWC is not enabled for mali without auto_scheduler." 
) - else: - logger.error("AutoTVM doesn't support NHWC winograd on Mali currently") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), + naive_schedule, # this implementation should never be picked by autotvm + name="conv2d_nhwc_winograd_without_weight_transform", + plevel=15, + ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) diff --git a/python/tvm/topi/mali/conv2d.py b/python/tvm/topi/mali/conv2d.py index 314b21febcd7..6da3c5f6ec68 100644 --- a/python/tvm/topi/mali/conv2d.py +++ b/python/tvm/topi/mali/conv2d.py @@ -578,7 +578,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): @conv2d_winograd_nhwc.register(["mali"]) -def conv2d_winograd_nhwc_cuda( +def conv2d_winograd_nhwc_mali( data, weight, strides, padding, dilation, out_dtype, pre_computed=False ): """Conv2D Winograd in NHWC layout. diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 8de696cf3fc2..ead9f16a256f 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -383,7 +383,10 @@ def conv2d_nhwc( # Infer shape for the rewritten layout # todo(merrymercy): wrap this with a more general interface. if len(Filter.shape) == 17: - # For mali + # For mali. + # GPU tile structure is SSSRRSRS + # You could refer function comment of DoMultiLevelTiling + # in the utils.h to see more detail explanation. kernel_h = Filter.shape[6] * Filter.shape[9] * Filter.shape[13] kernel_w = Filter.shape[7] * Filter.shape[10] * Filter.shape[14] channel = Filter.shape[8] * Filter.shape[11] * Filter.shape[15] diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index f6f129795447..93f34609cbbc 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -92,8 +92,8 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target max_threads_per_block, max_vthread_extent, warp_size); } else if (target->kind->device_type == kDLOpenCL) { if (target->GetAttr("device", "") == "mali") { - // We can not use device api to get attr like CUDA - // because like Mali target is normally on the remote machine + // We cannot use device API to get hardware attributes like CUDA, + // because like Mali target is normally on the remote machine. int max_shared_memory_per_block = 32768; int max_local_memory_per_block = INT32_MAX; // skip the check on local memory int max_threads_per_block = 256; diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index 618f3c921a6e..c4f38a71ed9f 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -135,7 +135,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): network = "mobilenet" batch_size = 1 layout = "NHWC" -# replace this with the device key in your tracker +# Replace this with the device key in your tracker device_key = "rk3399" # Set this to True if you use ndk tools for cross compiling use_ndk = True @@ -165,6 +165,30 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"): for idx, task in enumerate(tasks): print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) print(task.compute_dag) +###################################################################### +# .. note:: How to get the hardware parameters from remote device +# +# .. 
code-block:: python
+#
+# from tvm.auto_scheduler.utils import request_remote
+# remote = request_remote(device_key, "0.0.0.0", 9190)
+# ctx = remote.cl()
+# max_shared_memory_per_block = ctx.max_shared_memory_per_block
+# # There is no explicit local memory limitation,
+# # so we can use INT32_MAX to disable the check on local_memory.
+# max_local_memory_per_block = 2147483647 # INT32_MAX
+# max_threads_per_block = ctx.max_threads_per_block
+# max_vthread_extent = int(ctx.warp_size / 4) if int(ctx.warp_size / 4) > 1 else ctx.warp_size
+# warp_size = ctx.warp_size
+# hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
+# max_shared_memory_per_block, max_local_memory_per_block,
+# max_threads_per_block, max_vthread_extent, warp_size)
+# Now you could pass it to search task and tune
+#
+# .. code-block:: python
+#
+# tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, target_host, hardware_params)
+#
 #################################################################
 # Tuning and Evaluate

From c0d7e31c855e93efc13a6fa0d240825ae806b902 Mon Sep 17 00:00:00 2001
From: Zhao Wu
Date: Mon, 21 Dec 2020 23:16:13 +0800
Subject: [PATCH 5/5] fix doc

---
 tutorials/auto_scheduler/tune_network_mali.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py
index c4f38a71ed9f..6127963c0c47 100644
--- a/tutorials/auto_scheduler/tune_network_mali.py
+++ b/tutorials/auto_scheduler/tune_network_mali.py
@@ -183,6 +183,7 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32"):
 # hardware_params = auto_scheduler.HardwareParams(-1, 16, 64,
 # max_shared_memory_per_block, max_local_memory_per_block,
 # max_threads_per_block, max_vthread_extent, warp_size)
+#
 # Now you could pass it to search task and tune
 #
 # .. code-block:: python