diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 2d7116475bc5..5b666bc9d2e0 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -58,8 +58,10 @@ def evaluate_network(network, target, target_host, number):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-                        ['resnet-18', 'resnet-34', 'vgg-16',
-                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.0', 'squeezenet v1.1'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str, choices=
                         ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
                          'pixel2', 'rasp3b', 'pynq'], default='rk3399',
@@ -68,7 +70,7 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=6)
+    parser.add_argument("--number", type=int, default=3)
     args = parser.parse_args()
 
     dtype = 'float32'
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index 873e60f82c59..a0eb4a055103 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -17,8 +17,10 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-        ['resnet-18', 'resnet-34', 'resnet-50', 'vgg-16', 'vgg-19',
-         'inception_v3', 'mobilenet', 'mobilenet_v2', 'densenet-121'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str,
                         choices=['1080ti', 'titanx', 'gfx900'], default='1080ti',
                         help="The model of the test device. If your device is not listed in "
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
index 8e29fa5dab9a..a75620b3fe08 100644
--- a/apps/benchmark/mobile_gpu_imagenet_bench.py
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -58,8 +58,10 @@ def evaluate_network(network, target, target_host, number):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--network", type=str, choices=
-                        ['resnet-18', 'resnet-34', 'vgg-16',
-                         'mobilenet', 'mobilenet_v2', 'squeezenet v1.1'])
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
     parser.add_argument("--model", type=str, choices=
                         ['rk3399'], default='rk3399',
                         help="The model of the test device. If your device is not listed in "
@@ -67,7 +69,7 @@ def evaluate_network(network, target, target_host, number):
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=10)
+    parser.add_argument("--number", type=int, default=30)
     args = parser.parse_args()
 
     dtype = 'float32'
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index bde706ee6cfb..9a309fd5b338 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -20,12 +20,12 @@
 
 # the version of each package
 PACKAGE_VERSION = {
-    'arm_cpu': "v0.01",
+    'arm_cpu': "v0.03",  # the version is embedded in the "<name>_<version>.log" file name that context() loads
 
     'cuda':    "v0.02",
     'rocm':    "v0.01",
     'opencl':  "v0.01",
-    'mali':    "v0.01",
+    'mali':    "v0.02",
 
     'vta':     "v0.01",
 }
@@ -38,7 +38,7 @@ def _alias(name):
         'vtacpu': 'vta',
 
         'metal': 'opencl',
-        'nvptx': 'cuda'
+        'nvptx': 'cuda',
     }
     return table.get(name, name)
 
@@ -61,11 +61,12 @@ def context(target, extra_files=None):
     if isinstance(target, str):
         target = _target.create(target)
 
-    possible_names = [str(target).split()[0]]
+    possible_names = []  # candidate package names, in the order they will be tried
     for opt in target.options:
         if opt.startswith("-device"):
             device = _alias(opt[8:])
             possible_names.append(device)
+    possible_names.append(target.target_name)  # appended last, so any -device name is tried first
 
     all_packages = list(PACKAGE_VERSION.keys())
     for name in possible_names:
@@ -75,6 +76,7 @@ def context(target, extra_files=None):
 
             filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
             best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
+            break   # stop after the first file loaded: only one file is used, to avoid fallback template mismatch problems
 
     if extra_files:
         for filename in extra_files:
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 6a924a4b133c..a193e9acf5cb 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -506,8 +506,8 @@ def _callback(op):
 
 
 ##### REGISTER ALTER OP LAYOUT #####
-@conv2d_alter_layout.register(["arm_cpu", "mali"])
-def _alter_conv2d_layout(attrs, inputs, tinfos):
+@conv2d_alter_layout.register(["arm_cpu"])
+def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     """Alter op layout for pre-computing kernel transformation"""
     import nnvm.symbol as sym
     copy_inputs = [s for s in inputs]
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index 6bbf735af18e..d031acdd9a2b 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -9,11 +9,11 @@
 from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
 from ..util import traverse_inline, get_const_int, get_const_tuple, const_matrix
 from ..nn import conv2d, conv2d_winograd_without_weight_transform, \
-    get_pad_tuple, pad
+    get_pad_tuple, pad, conv2d_alter_layout
 
 # reuse some compute declarations from ARM CPU
 from ..arm_cpu.conv2d import _conv_arg_to_workload, _decl_spatial_pack,\
-    _winograd_conv_arg_to_workload
+    _winograd_conv_arg_to_workload, _alter_conv2d_layout_arm
 
 
 @conv2d.register('mali')
@@ -410,6 +410,12 @@ def _schedule_winograd(cfg, s, op):
 
     s[Y].compute_at(s[output], tt)
 
+@conv2d_alter_layout.register(["mali"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):  # mali wrapper that delegates to the ARM CPU implementation
+    try:
+        return _alter_conv2d_layout_arm(attrs, inputs, tinfos)
+    except KeyError:  # to filter out fallback opencl templates; the KeyError is swallowed and None is returned
+        return None
 
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @conv2d_winograd_without_weight_transform.register(['mali'])
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index f65832a14bdb..14aa0b742a8a 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -69,16 +69,11 @@ def check_device(device):
         np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters; active only while this device is checked
+            check_device(device)
 
 
 def test_conv2d_nchw():
-    # load tophub
-    ctx = autotvm.apply_history_best([])
-    for device in get_all_backend():
-        context = autotvm.tophub.context(device)
-        context.__enter__()
-
     # ResNet18 workloads
     verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
     verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 4d3c45763dfb..b03916b9ba09 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -102,7 +102,8 @@ def get_ref_data():
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters; active only while this device is checked
+            check_device(device)
 
 
 def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
@@ -201,16 +202,11 @@ def get_ref_data():
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
     for device in get_all_backend():
-        check_device(device)
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters; active only while this device is checked
+            check_device(device)
 
 
 def test_depthwise_conv2d():
-    # load tophub
-    ctx = autotvm.apply_history_best([])
-    for device in get_all_backend():
-        context = autotvm.tophub.context(device)
-        context.__enter__()
-
     # mobilenet workloads
     depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nchw(1, 64, 112, 1, 3, 2, "SAME")